Added handling of tags motto_podpis, podtytul and nota.
[wolnelektury.git] / bin / bookfragments.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 import optparse
4 import os
5 import copy
6
7 from lxml import etree
8
9
class Fragment(object):
    """A themed fragment, stored as an ordered stream of parse events.

    Each recorded event is an ``(event, payload)`` tuple.  For ``'start'``
    and ``'end'`` events the payload is an element-like object exposing
    ``tag``, ``attrib``, ``text`` and ``tail``; for any other event name the
    payload is treated as a plain text string.
    """

    def __init__(self, id, themes):
        """Create an empty fragment.

        :param id: identifier of the fragment (stored as ``self.id``).
        :param themes: themes associated with the fragment (stored as-is).
        """
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair at the end of the stream."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by 'end' events for still-open tags.

        Every 'start' pushes a matching pending 'end'; every 'end' pops one.
        Whatever remains pending is appended innermost-first so that the
        resulting stream is balanced.  An 'end' without any pending 'start'
        is reported on stdout and otherwise left in the stream untouched.
        """
        pending = []
        for event, element in self.events:
            if event == 'start':
                pending.append(('end', element))
            elif event == 'end':
                try:
                    pending.pop()
                except IndexError:
                    # Single-argument call form: same output as the old
                    # print statement, but also valid on Python 3.
                    print('%s %s' % ('CLOSED NON-OPEN TAG:', element))

        pending.reverse()
        return self.events + pending

    def to_string(self):
        """Serialize the balanced event stream to a markup string.

        Text, tails and attribute values are escaped (``&``, ``<``, ``>``
        and, inside attribute values, ``"``) so that characters which are
        special in markup cannot break the generated document.
        """
        # Function-scope import keeps this class self-contained.
        from xml.sax.saxutils import escape

        def render_attribute(name, value):
            # Values are always wrapped in double quotes, so a literal
            # double quote has to be escaped as well.
            return '%s="%s"' % (name, escape(value, {'"': '&quot;'}))

        result = []
        for event, element in self.closed_events():
            if event == 'start':
                attributes = ' '.join(
                    render_attribute(k, v) for k, v in element.attrib.items())
                result.append(u'<%s %s>' % (element.tag, attributes))
                if element.text:
                    result.append(escape(element.text))
            elif event == 'end':
                result.append(u'</%s>' % element.tag)
                if element.tail:
                    result.append(escape(element.tail))
            else:
                # Any other event carries a plain text payload.
                result.append(escape(element))

        return ''.join(result)

    def __unicode__(self):
        return self.to_string()
52
53
def extract_fragments(input_filename):
    """Extract theme fragments from input_filename.

    The file is parsed incrementally with ``etree.iterparse``.  A
    ``<span class="theme-begin" fid="...">`` opens a fragment and a
    ``<span class="theme-end" fid="...">`` with the same ``fid`` closes it;
    every other parse event seen in between is recorded in all fragments
    that are open at that moment.

    :param input_filename: source passed straight to ``etree.iterparse``.
    :returns: a ``(closed_fragments, open_fragments)`` tuple of dicts keyed
        by fragment id; ``open_fragments`` holds fragments never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    for event, element in etree.iterparse(input_filename, events=('start', 'end')):
        css_class = element.get('class', '')

        # Fragment boundary markers
        if element.tag == 'span' and css_class in ('theme-begin', 'theme-end'):
            # Handle each marker only once, on its 'end' event
            if event != 'end':
                continue

            if css_class == 'theme-begin':
                # Open a new fragment
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Collect the marker's ancestors up to (excluding) <body>.
                # The walk also stops at the document root, so a marker
                # without any <body> ancestor no longer raises
                # AttributeError on ``None.tag``.
                parents = []
                parent = element.getparent()
                while parent is not None and parent.tag != 'body':
                    parents.append(parent)
                    parent = parent.getparent()

                # Outermost ancestor first, so the fragment starts with
                # properly nested opening tags.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment
            else:
                # Close an existing fragment
                fid = element.get('fid')
                try:
                    fragment = open_fragments[fid]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (input_filename, fid))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # The marker itself is not recorded, but the text following it
            # is added to every fragment that is still open.
            if element.tail:
                for open_fragment in open_fragments.values():
                    open_fragment.append('text', element.tail)

        # Every element that is not a boundary marker
        else:
            # Elements with a non-empty 'name' attribute and annotations are
            # omitted; only the text that follows them is kept.
            if len(element.get('name', '')) or css_class == 'annotation':
                if event == 'end' and element.tail:
                    for open_fragment in open_fragments.values():
                        open_fragment.append('text', element.tail)
            else:
                for open_fragment in open_fragments.values():
                    open_fragment.append(event, copy.copy(element))

    return closed_fragments, open_fragments
108
109
if __name__ == '__main__':
    # Script entry point: for every SOURCE given on the command line, write
    # the extracted fragments to "<SOURCE without extension>.fragments.html".
    from xml.sax.saxutils import escape

    # Parse commandline arguments
    usage = """Usage: %prog [options] SOURCE [SOURCE...]
    Extract theme fragments from SOURCE."""

    parser = optparse.OptionParser(usage=usage)

    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False,
        help='print status messages to stdout')

    options, input_filenames = parser.parse_args()

    if not input_filenames:
        parser.print_help()
        # SystemExit works even where the site module's exit() helper is
        # not installed.
        raise SystemExit(1)

    # Do some real work
    for input_filename in input_filenames:
        if options.verbose:
            print(input_filename)

        output_filename = os.path.splitext(input_filename)[0] + '.fragments.html'

        closed_fragments, open_fragments = extract_fragments(input_filename)

        for fragment_id in open_fragments:
            print('%s:warning:unclosed fragment #%s' % (input_filename, fragment_id))

        output_file = open(output_filename, 'w')
        # try/finally guarantees the file is closed even if serializing or
        # writing a fragment fails.
        try:
            output_file.write("""
            <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
            <html><head>
                <title>bookfragments output</title>
                <meta http-equiv="content-type" content="text/html;charset=utf-8"/>
                <link rel="stylesheet" href="master.css" type="text/css" media="screen" charset="utf-8" />
            </head>
            <body>""")
            for fragment in closed_fragments.values():
                # Id and themes come from the parsed document, so they are
                # escaped before being placed inside the heading.
                heading_id = escape(u'%s' % (fragment.id,))
                heading_themes = escape(u'%s' % (fragment.themes,))
                html = u'<div class="fragment"><h3>[#%s] %s</h3>%s</div>' % (
                    heading_id, heading_themes, fragment)
                output_file.write(html.encode('utf-8'))
            output_file.write('</body></html>')
        finally:
            output_file.close()
152