Added book2html and bookfragments utilities to repository.
[wolnelektury.git] / bin / bookfragments.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
import copy
import optparse
import os
from xml.sax.saxutils import escape

from lxml import etree
8
9
# Parse args
# The usage text is shown by optparse for --help and when no SOURCE is given;
# %prog is expanded by optparse to the script name.
usage = """Usage: %prog [options] SOURCE [SOURCE...]
Extract theme fragments from SOURCE."""

parser = optparse.OptionParser(usage=usage)

parser.add_option(
    '-v', '--verbose', action='store_true', dest='verbose', default=False,
    help='print status messages to stdout')

# `options` carries the parsed flags (only `verbose`); every remaining
# positional argument is treated as an input file name.
options, input_filenames = parser.parse_args()

# At least one SOURCE is required: show the help text and stop with status 1.
if not input_filenames:
    parser.print_help()
    # Raise SystemExit directly instead of calling exit(): the `exit` name is
    # only injected by the `site` module for interactive use and is missing
    # when the interpreter runs without it (e.g. `python -S`).
    raise SystemExit(1)
24
25
class Fragment(object):
    """A themed fragment of a document, recorded as a flat stream of events.

    Each event is an ``(event, element)`` tuple.  For ``'start'`` and
    ``'end'`` events ``element`` must expose ``tag``, ``attrib``, ``text``
    and ``tail``; for any other event name ``element`` is treated as a plain
    text string and emitted as character data.

    Attributes:
        id: identifier of the fragment, as passed to the constructor.
        themes: theme description of the fragment, as passed to the
            constructor.
        events: list of recorded ``(event, element)`` tuples, in order.
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair at the end of the stream."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the recorded events followed by the missing ``'end'`` events.

        Every ``'start'`` that has no later ``'end'`` gets a synthetic
        ``('end', element)`` appended, innermost first, so that all opened
        tags are closed.  An ``'end'`` arriving while nothing is open is
        reported on stdout; it is still kept in the returned list.
        ``self.events`` itself is not modified.
        """
        pending = []
        for event, element in self.events:
            if event == 'start':
                pending.append(('end', element))
            elif event == 'end':
                try:
                    # Tags are not compared: any 'end' closes the most
                    # recently opened element.
                    pending.pop()
                except IndexError:
                    print('CLOSED NON-OPEN TAG: %s' % (element,))

        pending.reverse()
        return self.events + pending

    def to_string(self):
        """Serialize the closed event stream to a markup string.

        Element text, tails, free text events and attribute values are
        escaped (``&``, ``<``, ``>``, plus ``"`` inside attribute values) so
        that character data coming from the parsed source cannot break the
        generated markup.
        """
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                # Each attribute brings its own leading space, so an element
                # without attributes is written as <tag>, not <tag >.
                attributes = u''.join(
                    u' %s="%s"' % (name, escape(value, {'"': '&quot;'}))
                    for name, value in element.attrib.items())
                result.append(u'<%s%s>' % (element.tag, attributes))
                if element.text:
                    result.append(escape(element.text))
            elif event == 'end':
                result.append(u'</%s>' % element.tag)
                if element.tail:
                    result.append(escape(element.tail))
            else:
                # Any other event carries a plain string payload.
                result.append(escape(element))

        return u''.join(result)

    def __unicode__(self):
        return self.to_string()
68
69
# Do some real work
# For every input file: stream-parse it, collect the events that occur between
# matching <span class="theme-begin" fid="..."> and
# <span class="theme-end" fid="..."> markers into Fragment objects, and write
# all properly closed fragments to "<input name without extension>.fragments.html".
for input_filename in input_filenames:
    if options.verbose:
        print(input_filename)

    output_filename = os.path.splitext(input_filename)[0] + '.fragments.html'

    open_fragments = {}    # fid -> Fragment still collecting events
    closed_fragments = {}  # fid -> Fragment whose theme-end marker was seen

    # NOTE(review): elements are inspected and copied while parsing is still
    # in progress; whether text/tail are always complete at that point
    # depends on lxml's iterparse guarantees -- confirm.
    for event, element in etree.iterparse(input_filename, events=('start', 'end')):

        # Process begin and end elements
        if element.tag == 'span' and element.get('class', '') in ('theme-begin', 'theme-end'):
            if event != 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Collect the ancestors of the marker, innermost first, up to
                # (but excluding) <body>.  Stopping at the document root as
                # well keeps a marker without a <body> ancestor from failing
                # on a missing parent.
                parents = []
                parent = element.getparent()
                while parent is not None and parent.tag != 'body':
                    parents.append(parent)
                    parent = parent.getparent()

                # Re-open the ancestors outermost first, so the fragment
                # starts inside the same nesting as the marker.
                for parent in reversed(parents):
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (input_filename, element.get('fid')))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # The marker itself is dropped, but the text following it belongs
            # to every fragment that is still open (we don't want to lose any
            # text).
            if element.tail:
                for fragment in open_fragments.values():
                    fragment.append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags: elements with a non-empty "name"
            # attribute or class "annotation" are skipped, only their
            # trailing text is kept.
            if element.get('name', '') or element.get('class', '') == 'annotation':
                if event == 'end' and element.tail:
                    for fragment in open_fragments.values():
                        fragment.append('text', element.tail)
            else:
                # A shallow copy is stored for each event, so the recorded
                # element is detached from the element the parser goes on
                # working with.
                for fragment in open_fragments.values():
                    fragment.append(event, copy.copy(element))

    # Fragments never closed by a theme-end marker are reported and not written.
    for fragment_id in open_fragments:
        print('%s:warning:unclosed fragment #%s' % (input_filename, fragment_id))

    # `with` closes the output file even if serializing a fragment fails.
    with open(output_filename, 'w') as output_file:
        output_file.write("""
        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
        <html><head>
            <title>bookfragments output</title>
            <meta http-equiv="content-type" content="text/html;charset=utf-8"/>
            <link rel="stylesheet" href="master.css" type="text/css" media="screen" charset="utf-8" />
        </head>
        <body>""")
        for fragment in closed_fragments.values():
            # The id and themes come straight from the parsed document, so
            # they are escaped before being placed inside the heading.
            heading = escape(u'[#%s] %s' % (fragment.id, fragment.themes))
            html = u'<div class="fragment"><h3>%s</h3>%s</div>' % (heading, fragment.to_string())
            output_file.write(html.encode('utf-8'))
        output_file.write('</body></html>')
148