2 # -*- coding: utf-8 -*-
 
  10 class Fragment(object):
         # Ordered record of (event, element) pairs that can be rendered as markup text.
         # NOTE(review): this listing keeps residual line numbers and skips source lines
         # (e.g. 13-16, 21, 23, 25-28, 33-35); comments describe only the visible statements.
 
  11     def __init__(self, id, themes):
             # NOTE(review): the attribute assignments are not visible here; other code reads
             # .id, .themes and .events, so they are presumably set on the skipped lines -- confirm.
 
  12         super(Fragment, self).__init__()
 
  17     def append(self, event, element):
             # Record one (event, element) pair at the end of self.events.
 
  18         self.events.append((event, element))
 
  20     def closed_events(self):
             # Return self.events followed by extra ('end', element) entries collected in `stack`.
             # NOTE(review): the creation of `stack` and the branch conditions are on skipped lines.
 
  22         for event, element in self.events:
 
                     # Queues an 'end' entry for this element (guarding condition not visible).
  24                 stack.append(('end', element))
 
                         # Diagnostic on a branch whose condition is not visible; the message
                         # suggests an 'end' event that had no matching open tag.
  29                     print 'CLOSED NON-OPEN TAG:', element
 
  32         return self.events + stack
 
             # NOTE(review): the `def` line for the method below is not visible; __unicode__ calls
             # self.to_string(), so this is presumably to_string() -- confirm.
             # It walks closed_events() and joins the collected pieces into one string.
  36         for event, element in self.closed_events():
 
                     # Opening tag with every attribute written as key="value".
                     # NOTE(review): attribute values are interpolated as-is, no escaping is visible.
  38                 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 
  40                     result.append(element.text)
 
  42                 result.append(u'</%s>' % element.tag)
 
  44                     result.append(element.tail)
 
                     # Appends `element` itself; presumably the 'text' event case, where the
                     # stored value is a plain string (see the 'text' appends in extract_fragments).
  46                 result.append(element)
 
  48         return ''.join(result)
 
  50     def __unicode__(self):
             # Unicode conversion delegates to to_string().
 
  51         return self.to_string()
 
  54 def extract_fragments(input_filename):
 
  55     """Extracts theme fragments from input_filename."""
         # Streams the document with etree.iterparse and returns a pair
         # (closed_fragments, open_fragments); both are indexed by fragment id below.
         # NOTE(review): this listing skips source lines (e.g. 56-58, 63-64, 67-68, 73-74, 81-82,
         # 84, 86, 91, 97); the creation of both containers and several if/else/try headers
         # are not visible, so the comments cover only the statements shown.
 
  59     for event, element in etree.iterparse(input_filename, events=('start', 'end')):
 
  60         # Process begin and end elements
 
  61         if element.tag == 'span' and element.get('class', '') in ('theme-begin', 'theme-end'):
 
  62             if not event == 'end': continue # Process elements only once, on end event
 
                 # A 'theme-begin' marker opens a new fragment: id from the 'fid' attribute,
                 # themes from the marker's text.
  65             if element.get('class', '') == 'theme-begin':
 
  66                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 
                     # When the marker is not a direct child of <body>, collect its ancestors
                     # (nearest first) up to, but not including, <body>.
                     # NOTE(review): if no ancestor has tag 'body', getparent() would presumably
                     # return None and the .tag access would fail -- confirm input guarantees.
  69                 if element.getparent().tag != 'body':
 
  70                     parents = [element.getparent()]
 
  71                     while parents[-1].getparent().tag != 'body':
 
  72                         parents.append(parents[-1].getparent())
 
                         # Record a 'start' event for each collected ancestor.
                         # NOTE(review): lines 73-74 are not visible; any reordering of `parents`
                         # would happen there.
  75                     for parent in parents:
 
  76                         fragment.append('start', parent)
 
  78                 open_fragments[fragment.id] = fragment
 
  80             # Close existing fragment
 
                         # Look up the open fragment named by the marker's 'fid' attribute.
  83                     fragment = open_fragments[element.get('fid')]
 
                         # Message for an end marker whose fragment is not open (the enclosing
                         # branch header is not visible).
  85                     print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
 
                         # Move the fragment from the open set to the closed set.
  87                     closed_fragments[fragment.id] = fragment
 
  88                     del open_fragments[fragment.id]
 
  90             # Append element tail to lost_text (we don't want to lose any text)
 
                     # The marker's tail text is added as a 'text' event to every open fragment.
  92                 for fragment_id in open_fragments:
 
  93                     open_fragments[fragment_id].append('text', element.tail)
 
  96         # Process all elements except begin and end
 
  98             # Omit annotation tags
 
                 # Elements with a non-empty 'name' attribute or class 'annotation' are not
                 # recorded; only their tail text is kept, once, on the 'end' event.
  99             if len(element.get('name', '')) or element.get('class', '') == 'annotation':
 
 100                 if event == 'end' and element.tail:
 
 101                     for fragment_id in open_fragments:
 
 102                         open_fragments[fragment_id].append('text', element.tail)
 
                     # Every open fragment receives the event together with a copy.copy()
                     # of the element (the branch header for this loop is not visible).
 104                 for fragment_id in open_fragments:
 
 105                     open_fragments[fragment_id].append(event, copy.copy(element))
 
 107     return closed_fragments, open_fragments
 
 110 if __name__ == '__main__':
         # Command-line entry point: for each SOURCE file, extract its fragments and write
         # them to '<SOURCE without extension>.fragments.html'.
         # NOTE(review): this listing skips source lines (e.g. 123-126, 128-130, 141, 145-146);
         # comments are placed only where they cannot fall inside a string literal.
 
 111     # Parse commandline arguments
 
 112     usage = """Usage: %prog [options] SOURCE [SOURCE...]
 
 113     Extract theme fragments from SOURCE."""
 
 115     parser = optparse.OptionParser(usage=usage)
 
         # -v / --verbose is a boolean flag stored in options.verbose, defaulting to False.
 117     parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False,
 
 118         help='print status messages to stdout')
 
 120     options, input_filenames = parser.parse_args()
 
         # At least one input file is expected; the handling for the empty case is on
         # lines not visible here.
 122     if len(input_filenames) < 1:
 
 127     for input_filename in input_filenames:
 
             # Output path: the input path with its extension replaced by '.fragments.html'.
 131         output_filename = os.path.splitext(input_filename)[0] + '.fragments.html'
 
 133         closed_fragments, open_fragments = extract_fragments(input_filename)
 
             # Fragments still open after parsing are reported as warnings.
 135         for fragment_id in open_fragments:
 
 136             print '%s:warning:unclosed fragment #%s' % (input_filename, fragment_id)
 
             # Write the page: a fixed XHTML header, then one <div class="fragment"> per closed
             # fragment (encoded as UTF-8), then the closing </body></html>.
             # NOTE(review): no close() call for output_file is visible in the lines shown.
 138         output_file = open(output_filename, 'w')
 
 139         output_file.write("""
 
 140             <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
 
 142                 <title>bookfragments output</title>
 
 143                 <meta http-equiv="content-type" content="text/html;charset=utf-8"/>
 
 144                 <link rel="stylesheet" href="master.css" type="text/css" media="screen" charset="utf-8" />
 
 147         for fragment in closed_fragments.values():
 
 148             html = u'<div class="fragment"><h3>[#%s] %s</h3>%s</div>' % (fragment.id, fragment.themes, fragment)
 
 149             output_file.write(html.encode('utf-8'))
 
 150         output_file.write('</body></html>')