1 # Code taken from ActiveState Python recipes:
2 # http://code.activestate.com/recipes/389023/
4 # Changed by Marek Stepniowski <marek@stepniowski.com> to handle unicode characters
class simpleHandler(xml.sax.ContentHandler):
    """SAX handler that records plain text and the span each element covers.

    While a document is parsed, character data is accumulated in ``content``
    and every closed element is recorded in ``elements`` as a
    ``((start, end), name, attrs)`` tuple, where ``start`` and ``end`` are
    offsets into ``content``.  Elements named ``foobar`` are skipped: the
    markup is expected to be wrapped in an outer ``<foobar>`` element, which
    is not part of the content.
    """

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        # Closed elements, in closing order: ((start, end), name, attrs).
        self.elements = []
        # Stack of elements whose end tag has not been seen yet.
        self.open_elements = []
        # Character data seen so far; element offsets index into this.
        self.content = ""

    def startElement(self, name, attrs):
        """Remember where element ``name`` starts within ``content``."""
        if name == 'foobar':
            return  # we require an outer wrapper, which we promptly ignore.
        self.open_elements.append({'name': name,
                                   # copy: the parser may reuse ``attrs``
                                   'attrs': attrs.copy(),
                                   'start': len(self.content)})

    def endElement(self, name):
        """Close the innermost open element called ``name`` and record it."""
        if name == 'foobar':
            return  # we require an outer wrapper, which we promptly ignore.
        # Search from the top of the stack: with nested elements of the same
        # name, an end tag always belongs to the most recently opened one.
        # Scanning from index 0 would pair it with the outermost element and
        # record wrong spans for both.
        for i in range(len(self.open_elements) - 1, -1, -1):
            if self.open_elements[i]['name'] == name:
                e = self.open_elements.pop(i)
                self.elements.append(((e['start'],         # start position
                                       len(self.content)),  # current (end) position
                                      e['name'], e['attrs']))
                return

    def characters(self, chunk):
        """Append a chunk of character data to ``content``."""
        self.content += chunk
class MarkupString(type(u"")):  # ``unicode`` on Python 2, ``str`` on Python 3
    """A text string carrying XML-style markup that survives slicing.

    The string value itself is the marked-up source.  ``raw`` holds the
    plain text with all tags removed, and indexing/slicing works on
    positions in ``raw``: the result is a new ``MarkupString`` for that
    stretch of text, with the start/end tags of every element overlapping
    it re-inserted so the fragment is well-formed markup again.
    """

    def __init__(self, string):
        super(MarkupString, self).__init__()
        self.handler = simpleHandler()
        # Wrap in a throw-away root element so that fragments with several
        # (or no) top-level elements still parse; the handler ignores it.
        xml.sax.parseString((u"<foobar>%s</foobar>" % string).encode('utf-8'),
                            self.handler)
        self.raw = self.handler.content

    def __getitem__(self, n):
        """Return the marked-up character at ``n``, or a marked-up slice.

        Negative indices count from the end of ``raw``.  Raises
        ``IndexError`` for an out-of-range index (which is also what lets
        sequence-protocol iteration terminate) and ``TypeError`` for a
        slice with a step other than 1.
        """
        if isinstance(n, slice):
            # Python 3 (and extended slices on Python 2) arrive here.
            if n.step not in (None, 1):
                raise TypeError("MarkupString does not support extended slices")
            return self.__getslice__(n.start, n.stop)
        length = len(self.raw)
        if n < 0:
            n += length
        if not 0 <= n < length:
            raise IndexError("MarkupString index out of range")
        return self.__getslice__(n, n + 1)

    def __getslice__(self, s, e):
        """Return ``raw[s:e]`` as a ``MarkupString`` with its markup restored.

        ``s`` and ``e`` follow normal slice rules against ``raw``: ``None``
        means "open", negative values count from the end, and out-of-range
        values are clamped.  Python 2 calls this method directly for
        ``ms[a:b]``; elsewhere ``__getitem__`` forwards to it.

        NOTE(review): on Python 2 the interpreter pre-adjusts negative
        bounds using ``len(self)`` (the marked-up length) before calling
        this method, so negative simple slices may be off there -- confirm
        if that matters to callers.
        """
        raw = self.raw
        # slice.indices() resolves None / negative / oversized bounds (such
        # as Python 2's sys.maxint for an open end) against the plain text.
        start, stop, _ = slice(s, e).indices(len(raw))
        stop = max(start, stop)  # an inverted range is simply empty

        # Cycle through the elements that affect our slice and keep track of
        # where their start and end tags should go.
        starts = {}
        ends = {}
        for (el_start, el_end), name, attrs in self.handler.elements:
            # only include relevant elements: those that end after our
            # start and start before our end
            if el_end < start or el_start > stop:
                continue
            # Clamp the element to the slice.
            spos = max(el_start, start)
            epos = min(el_end, stop)
            if spos == epos:
                continue  # we don't care about tags that don't markup any text
            # write our start tag <stag att="val"...>
            stag = "<%s" % name
            for k, v in attrs.items():
                stag += " %s=%s" % (k, xml.sax.saxutils.quoteattr(v))
            stag += ">"
            starts.setdefault(spos, []).append(stag)
            ends.setdefault(epos, []).append("</%s>" % name)  # simple end tag

        escape = xml.sax.saxutils.escape
        pieces = []  # our actual output, joined once at the end
        for pos in range(start, stop):  # we move through positions
            # End tags first, so elements closing here are shut before new
            # ones open.
            pieces.extend(ends.get(pos, ()))
            # Elements are recorded innermost-first; reverse the start tags
            # so the order works out, e.g. <i><b><u></u></b></i>
            pieces.extend(reversed(starts.get(pos, ())))
            # ``raw`` is parsed text, so '&', '<' and '>' must be escaped
            # again or the result would not re-parse as markup.
            pieces.append(escape(raw[pos]))
        pieces.extend(ends.get(stop, ()))
        return MarkupString(u"".join(pieces))