+# Code taken from ActiveState Python recipes:
+# http://code.activestate.com/recipes/389023/
+#
+# Changed by Marek Stepniowski <marek@stepniowski.com> to handle unicode characters
+import xml.sax
+
+
+class simpleHandler (xml.sax.ContentHandler):
+ """A simple handler that provides us with indices of marked up content."""
+ def __init__ (self):
+ self.elements = [] #this will contain a list of elements and their start/end indices
+ self.open_elements = [] #this holds info on open elements while we wait for their close
+ self.content = ""
+
+ def startElement (self,name,attrs):
+ if name=='foobar': return # we require an outer wrapper, which we promptly ignore.
+ self.open_elements.append({'name':name,
+ 'attrs':attrs.copy(),
+ 'start':len(self.content),
+ })
+
+ def endElement (self, name):
+ if name=='foobar': return # we require an outer wrapper, which we promptly ignore.
+ for i in range(len(self.open_elements)):
+ e = self.open_elements[i]
+ if e['name']==name:
+ # append a (start,end), name, attrs
+ self.elements.append(((e['start'], #start position
+ len(self.content)),# current (end) position
+ e['name'],e['attrs'])
+ )
+ del self.open_elements[i]
+ return
+
+ def characters (self, chunk):
+ self.content += chunk
+
+
+class MarkupString (unicode):
+ """A simple class for dealing with marked up strings. When we are sliced, we return
+ valid marked up strings, preserving markup."""
+ def __init__ (self, string):
+ unicode.__init__(self, string)
+ self.handler = simpleHandler()
+ xml.sax.parseString((u"<foobar>%s</foobar>" % string).encode('utf-8'), self.handler)
+ self.raw = self.handler.content
+
+ def __getitem__ (self, n):
+ return self.__getslice__(n,n+1)
+
+ def __getslice__ (self, s, e):
+ # only include relevant elements
+ if not e or e > len(self.raw): e = len(self.raw)
+ elements = filter(lambda tp: (tp[0][1] >= s and # end after the start...
+ tp[0][0] <= e # and start before the end
+ ),
+ self.handler.elements)
+ ends = {}
+ starts = {}
+ for el in elements:
+ # cycle through elements that effect our slice and keep track of
+ # where their start and end tags should go.
+ pos = el[0]
+ name = el[1]
+ attrs = el[2]
+ # write our start tag <stag att="val"...>
+ stag = "<%s"%name
+ for k,v in attrs.items(): stag += " %s=%s"%(k,xml.sax.saxutils.quoteattr(v))
+ stag += ">"
+ etag = "</%s>"%name # simple end tag
+ spos = pos[0]
+ epos = pos[1]
+ if spos < s: spos=s
+ if epos > e: epos=e
+ if epos != spos: # we don't care about tags that don't markup any text
+ if not starts.has_key(spos): starts[spos]=[]
+ starts[spos].append(stag)
+ if not ends.has_key(epos): ends[epos]=[]
+ ends[epos].append(etag)
+ outbuf = "" # our actual output string
+ for pos in range(s,e): # we move through positions
+ char = self.raw[pos]
+ if ends.has_key(pos): # if there are endtags to insert...
+ for et in ends[pos]: outbuf += et
+ if starts.has_key(pos): # if there are start tags to insert
+ mystarts = starts[pos]
+ # reverse these so the order works out,e.g. <i><b><u></u></b></i>
+ mystarts.reverse()
+ for st in mystarts: outbuf += st
+ outbuf += char
+ if ends.has_key(e):
+ for et in ends[e]: outbuf+= et
+ return MarkupString(outbuf)
+