Added markupstring to lib folder.
author Marek Stępniowski <marek@stepniowski.com>
Thu, 4 Sep 2008 20:18:17 +0000 (22:18 +0200)
committer Marek Stępniowski <marek@stepniowski.com>
Thu, 4 Sep 2008 20:18:17 +0000 (22:18 +0200)
lib/markupstring.py [new file with mode: 0644]

diff --git a/lib/markupstring.py b/lib/markupstring.py
new file mode 100644 (file)
index 0000000..f31e148
--- /dev/null
@@ -0,0 +1,94 @@
+# Code taken from ActiveState Python recipes:
+# http://code.activestate.com/recipes/389023/
+#
+# Changed by Marek Stepniowski <marek@stepniowski.com> to handle unicode characters
+import xml.sax
+
+
+class simpleHandler (xml.sax.ContentHandler):
+    """SAX handler that records the text and the index range of each element.
+
+    After parsing:
+
+    * ``content`` holds the concatenated character data, markup removed.
+    * ``elements`` is a list of ``((start, end), name, attrs)`` tuples, in
+      the order the elements were closed.  ``start`` and ``end`` are
+      offsets into ``content``.
+    * ``open_elements`` is the stack of elements still waiting for their
+      end tag.
+
+    Elements named ``foobar`` are never recorded: that name is reserved
+    for the outer wrapper the document is parsed inside.
+    """
+    def __init__ (self):
+        # ContentHandler is not guaranteed to be a new-style class, so call
+        # the base initialiser explicitly instead of through super().
+        xml.sax.ContentHandler.__init__(self)
+        self.elements = [] # closed elements: ((start, end), name, attrs)
+        self.open_elements = [] # elements seen but not yet closed
+        self.content = ""
+
+    def startElement (self, name, attrs):
+        """Push the element, remembering where its content begins."""
+        if name == 'foobar':
+            return # we require an outer wrapper, which we promptly ignore.
+        self.open_elements.append({'name': name,
+                                   'attrs': attrs.copy(),
+                                   'start': len(self.content),
+                                   })
+
+    def endElement (self, name):
+        """Pop the innermost open element called *name* and record its range."""
+        if name == 'foobar':
+            return # we require an outer wrapper, which we promptly ignore.
+        # Search from the innermost element outwards.  Searching from the
+        # front would pair this end tag with the outermost same-named
+        # element and record wrong ranges for e.g. <b>a<b>b</b>c</b>.
+        for i in range(len(self.open_elements) - 1, -1, -1):
+            e = self.open_elements[i]
+            if e['name'] == name:
+                self.elements.append(((e['start'],         # start position
+                                       len(self.content)), # current (end) position
+                                      e['name'], e['attrs']))
+                del self.open_elements[i]
+                return
+
+    def characters (self, chunk):
+        """Append a chunk of character data to the accumulated text."""
+        self.content += chunk
+
+
+class MarkupString (unicode):
+    """A unicode string holding XML markup that can be sliced by text position.
+
+    The string is parsed once on construction.  ``raw`` holds the text
+    with all markup removed, and slice indices refer to positions in
+    ``raw``, not in the marked-up string.  Slicing returns a new
+    MarkupString in which every element overlapping the slice is
+    re-opened and re-closed around the part of the text it covered.
+
+    Known quirks, kept for backward compatibility:
+
+    * an end index of 0 (or None) means "to the end of the text";
+    * negative indices are not translated relative to ``raw``.
+    """
+    def __init__ (self, string):
+        unicode.__init__(self, string)
+        self.handler = simpleHandler()
+        # Wrap in a dummy root element so that fragments with several
+        # top-level nodes (or none at all) still parse as one document.
+        xml.sax.parseString((u"<foobar>%s</foobar>" % string).encode('utf-8'), self.handler)
+        self.raw = self.handler.content
+
+    def __getitem__ (self, n):
+        """Return the single marked-up character at text position *n*."""
+        return self.__getslice__(n, n + 1)
+
+    def __getslice__ (self, s, e):
+        """Return text positions ``s`` to ``e`` (exclusive) with their markup.
+
+        Text is XML-escaped on output, so the result is well-formed markup
+        even when the text contains ``&``, ``<`` or ``>``.
+        """
+        # Imported here so the method does not depend on the parser having
+        # loaded xml.sax.saxutils as a side effect.
+        from xml.sax import saxutils
+
+        if not e or e > len(self.raw):
+            e = len(self.raw)
+
+        starts = {} # text position -> start tags to emit before that character
+        ends = {}   # text position -> end tags to emit before that character
+        for (spos, epos), name, attrs in self.handler.elements:
+            # only include elements that overlap the slice
+            if epos < s or spos > e:
+                continue
+            # clamp the element to the slice boundaries
+            spos = max(spos, s)
+            epos = min(epos, e)
+            if spos == epos:
+                continue # we don't care about tags that don't mark up any text
+            # write our start tag <stag att="val"...>
+            stag = "<%s" % name
+            for k, v in attrs.items():
+                stag += " %s=%s" % (k, saxutils.quoteattr(v))
+            stag += ">"
+            starts.setdefault(spos, []).append(stag)
+            ends.setdefault(epos, []).append("</%s>" % name)
+
+        pieces = []
+        for pos in range(s, e):
+            # close elements that end here before opening new ones
+            pieces.extend(ends.get(pos, ()))
+            # elements were recorded innermost first; reverse so the
+            # outermost start tag is written first, e.g. <i><b><u></u></b></i>
+            pieces.extend(reversed(starts.get(pos, ())))
+            # raw holds decoded text; escape it again so the output re-parses
+            pieces.append(saxutils.escape(self.raw[pos]))
+        pieces.extend(ends.get(e, ()))
+        return MarkupString(u"".join(pieces))
+