-# Code taken from ActiveState Python recipes:
-# http://code.activestate.com/recipes/389023/
-#
-# Changed by Marek Stepniowski <marek@stepniowski.com> to handle unicode characters
-import xml.sax
-
-
-class simpleHandler(xml.sax.ContentHandler):
- """A simple handler that provides us with indices of marked up content."""
- def __init__(self):
- self.elements = [] #this will contain a list of elements and their start/end indices
- self.open_elements = [] #this holds info on open elements while we wait for their close
- self.content = ""
-
- def startElement(self, name, attrs):
- if name == 'foobar': return # we require an outer wrapper, which we promptly ignore.
- self.open_elements.append({'name':name,
- 'attrs':attrs.copy(),
- 'start':len(self.content),
- })
-
- def endElement(self, name):
- if name == 'foobar': return # we require an outer wrapper, which we promptly ignore.
- for i in range(len(self.open_elements)):
- e = self.open_elements[i]
- if e['name'] == name:
- # append a (start,end), name, attrs
- self.elements.append(((e['start'], #start position
- len(self.content)), # current (end) position
- e['name'], e['attrs'])
- )
- del self.open_elements[i]
- return
-
- def characters(self, chunk):
- self.content += chunk
-
-
-class MarkupString(unicode):
- """A simple class for dealing with marked up strings. When we are sliced, we return
- valid marked up strings, preserving markup."""
- def __init__(self, string):
- unicode.__init__(self)
- self.handler = simpleHandler()
- xml.sax.parseString((u"<foobar>%s</foobar>" % string).encode('utf-8'), self.handler)
- self.raw = self.handler.content
-
- def __getitem__(self, n):
- return self.__getslice__(n, n + 1)
-
- def __getslice__(self, s, e):
- # only include relevant elements
- if not e or e > len(self.raw): e = len(self.raw)
- elements = filter(lambda tp: (tp[0][1] >= s and # end after the start...
- tp[0][0] <= e # and start before the end
- ),
- self.handler.elements)
- ends = {}
- starts = {}
- for el in elements:
- # cycle through elements that effect our slice and keep track of
- # where their start and end tags should go.
- pos = el[0]
- name = el[1]
- attrs = el[2]
- # write our start tag <stag att="val"...>
- stag = "<%s" % name
- for k, v in attrs.items(): stag += " %s=%s" % (k, xml.sax.saxutils.quoteattr(v))
- stag += ">"
- etag = "</%s>" % name # simple end tag
- spos = pos[0]
- epos = pos[1]
- if spos < s: spos = s
- if epos > e: epos = e
- if epos != spos: # we don't care about tags that don't markup any text
- if not starts.has_key(spos): starts[spos] = []
- starts[spos].append(stag)
- if not ends.has_key(epos): ends[epos] = []
- ends[epos].append(etag)
- outbuf = "" # our actual output string
- for pos in range(s, e): # we move through positions
- char = self.raw[pos]
- if ends.has_key(pos): # if there are endtags to insert...
- for et in ends[pos]: outbuf += et
- if starts.has_key(pos): # if there are start tags to insert
- mystarts = starts[pos]
- # reverse these so the order works out,e.g. <i><b><u></u></b></i>
- mystarts.reverse()
- for st in mystarts: outbuf += st
- outbuf += char
- if ends.has_key(e):
- for et in ends[e]: outbuf += et
- return MarkupString(outbuf)
-
- def __len__(self):
- return len(self.raw)
-