e54e4cd8ae05791ceac890316913f31f0bdac4f5
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.  
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import shutil
11 import sys
12 from copy import deepcopy
13 from lxml import etree
14 import zipfile
15
16 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, XHTMLNS
17
# XML namespaces of the two EPUB control files assembled by this module:
# NCXNS qualifies the elements of the table of contents (OPS/toc.ncx),
# OPFNS qualifies the elements of the package document (OPS/content.opf).
NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
20
21
def inner_xml(node):
    """ Serialize the content of ``node``: its leading text followed by
    every child element as produced by ``etree.tostring``.

    The element's own start/end tags are not part of the result.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    leading = node.text
    if leading is None:
        # An element without leading text contributes an empty prefix.
        leading = ''
    pieces = [leading]
    for child in node:
        pieces.append(etree.tostring(child))
    return ''.join(pieces)
31
def set_inner_xml(node, text):
    """ Replace the content of ``node`` with the XML fragment ``text``.

    ``text`` is parsed inside a throw-away ``<x>`` wrapper, so it may hold
    plain text mixed with any number of elements; the wrapper's text and
    children then become the text and children of ``node``.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    fragment = '<x>%s</x>' % text
    wrapper = etree.fromstring(fragment)
    node.text = wrapper.text
    node[:] = wrapper[:]
45
46
def node_name(node):
    """ Return the plain-text name of ``node``.

    Works on a deep copy, so ``node`` itself is left untouched.  The
    content of 'pe', 'pa', 'pt', 'pr' and 'motyw' descendants is dropped
    (the text following them is kept), then all remaining markup is
    stripped and the resulting text is returned.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    skipped_tags = ('pe', 'pa', 'pt', 'pr', 'motyw')
    scratch = deepcopy(node)

    for tag in skipped_tags:
        for found in scratch.findall('.//%s' % tag):
            # clear() also resets the tail, which belongs to the
            # surrounding text - put it back afterwards.
            kept_tail = found.tail
            found.clear()
            found.tail = kept_tail
    etree.strip_tags(scratch, '*')
    return scratch.text
63
64
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet read from the file ``sheet`` to ``xml``.

    ``xml`` may be an element or an element tree; a bare element is
    wrapped in a tree first.  Returns whatever the tree's ``xslt()``
    method returns.
    """
    document = xml
    if isinstance(document, etree._Element):
        document = etree.ElementTree(document)
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
70
71
# Directory named 'epub' located next to this module (symlinks resolved).
_resdir = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'epub')


def res(fname):
    """ Return the path of the file ``fname`` inside the 'epub'
    directory that sits next to this module. """
    resource_path = os.path.join(_resdir, fname)
    return resource_path
75
76
def replace_characters(node):
    """ Recursively replace typewriter-style punctuation in ``node``.

    In the text and tail of ``node`` and of all its descendants:

      * ``&``   becomes ``&amp;``
      * ``---`` becomes ``&#8212;`` (em dash)
      * ``--``  becomes ``&#8211;`` (en dash)
      * ``,,``  becomes ``&#8222;`` (double low-9 quotation mark)
      * ``"``   becomes ``&#8221;`` (right double quotation mark)
      * ``'``   becomes ``&#8217;`` (right single quotation mark)

    Elements tagged 'extra' are emptied (text, children and attributes
    removed).  Their tail is kept and converted like any other text: the
    tail is part of the surrounding content, not of the 'extra' element.
    The tree is modified in place; nothing is returned.
    """
    # Order matters: '&' must be escaped before the replacements below
    # introduce ampersands of their own, and '---' must be handled
    # before '--'.
    substitutions = (
        ("&", "&amp;"),
        ("---", "&#8212;"),
        ("--", "&#8211;"),
        (",,", "&#8222;"),
        ('"', "&#8221;"),
        ("'", "&#8217;"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for plain, entity in substitutions:
            text = text.replace(plain, entity)
        return text

    if node.tag == 'extra':
        # clear() resets the tail as well; without restoring it the text
        # that follows the element would be silently deleted.
        tail = node.tail
        node.clear()
        node.tail = replace_chars(tail)
        return

    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
94
95
def find_annotations(annotations, source, part_number):
    """ Move the annotations found below ``source`` into ``annotations``.

    Every 'pe', 'pa', 'pt' or 'pr' descendant of ``source`` is copied to
    the end of ``annotations`` with two extra attributes: 'number' (its
    1-based position in ``annotations``) and 'part' (``part_number``).
    The element left in the text is emptied and its text set to that
    number; the text following it is preserved.  The content of 'extra'
    and 'podtytul' elements is not searched.
    """
    note_tags = ('pe', 'pa', 'pt', 'pr')
    unsearched_tags = ('extra', 'podtytul')

    for element in source:
        if element.tag in note_tags:
            number = str(len(annotations) + 1)
            note = deepcopy(element)
            note.set('number', number)
            note.set('part', str(part_number))
            note.tail = ''
            annotations.append(note)
            # Leave only the reference number in the running text;
            # clear() wipes the tail too, so restore it.
            following_text = element.tail
            element.clear()
            element.tail = following_text
            element.text = number
        if element.tag in unsearched_tags:
            continue
        find_annotations(annotations, element, part_number)
110
111
def replace_by_verse(tree):
    """ Turn '/'-terminated lines of every stanza into verse elements.

    For each ``strofa`` element below ``tree`` the serialized content is
    split on a slash followed by a newline.  When that yields more than
    one piece, each piece is wrapped in ``<wers_normalny>`` unless it
    already starts with ``<wers`` (or, at stanza level, ``<extra``), and
    the result replaces the stanza's content.  The same is done first,
    one level down, for 'slowo_obce' and 'wyroznienie' children of the
    stanza.  The tree is modified in place.
    """
    separator = '/\n'

    def rebuild(pieces, untouched_prefixes):
        # Wrap every piece that is not already an element we keep as is.
        rebuilt = []
        for piece in pieces:
            if piece.startswith(untouched_prefixes):
                rebuilt.append(piece)
            else:
                rebuilt.append('<wers_normalny>' + piece + '</wers_normalny>')
        return ''.join(rebuilt)

    for stanza in tree.findall('.//' + WLNS('strofa')):
        for inline in stanza:
            if inline.tag not in ('slowo_obce', 'wyroznienie'):
                continue
            inline_pieces = inner_xml(inline).split(separator)
            if len(inline_pieces) > 1:
                set_inner_xml(inline, rebuild(inline_pieces, ('<wers',)))
        stanza_pieces = inner_xml(stanza).split(separator)
        if len(stanza_pieces) > 1:
            set_inner_xml(stanza, rebuild(stanza_pieces, ('<wers', '<extra')))
136
137
def add_to_manifest(manifest, partno):
    """ Append an ``item`` entry for part number ``partno`` to the
    manifest section of content.opf.

    The entry gets the id ``part<partno>`` and points at
    ``part<partno>.html``.
    """
    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    item = manifest.makeelement(OPFNS('item'), attrib=attributes)
    manifest.append(item)
147
148
def add_to_spine(spine, partno):
    """ Append an ``itemref`` entry referring to ``part<partno>`` to the
    spine section of content.opf. """
    part_id = 'part%d' % partno
    itemref = spine.makeelement(OPFNS('itemref'), attrib={'idref': part_id})
    spine.append(itemref)
153
154
def _build_nav_point(nav_map, counter, title, src):
    """ Create (without attaching it anywhere) an NCX ``navPoint``.

    ``counter`` is used both for the id (``NavPoint-<counter>``) and for
    the playOrder attribute, ``title`` becomes the label text and ``src``
    the target of the ``content`` child.
    """
    nav_point = nav_map.makeelement(NCXNS('navPoint'))
    nav_point.set('id', 'NavPoint-%d' % counter)
    nav_point.set('playOrder', str(counter))

    nav_label = nav_map.makeelement(NCXNS('navLabel'))
    text = nav_map.makeelement(NCXNS('text'))
    text.text = title
    nav_label.append(text)
    nav_point.append(nav_label)

    content = nav_map.makeelement(NCXNS('content'))
    content.set('src', src)
    nav_point.append(content)

    return nav_point


def add_nav_point(nav_map, counter, title, part_counter):
    """ Append a top-level table-of-contents entry to ``nav_map``.

    The entry is labelled ``title``, numbered ``counter`` and points at
    ``part<part_counter>.html``.
    """
    src = 'part%d.html' % part_counter
    nav_map.append(_build_nav_point(nav_map, counter, title, src))


def add_nav_point2(nav_map, counter, title, part_counter, subcounter):
    """ Append a second-level table-of-contents entry.

    The entry is labelled ``title``, numbered ``counter`` and points at
    the anchor ``sub<subcounter>`` inside ``part<part_counter>.html``.
    It is nested inside the last entry currently in ``nav_map``, so
    ``nav_map`` must not be empty.
    """
    src = 'part%d.html#sub%d' % (part_counter, subcounter)
    nav_point = _build_nav_point(nav_map, counter, title, src)
    nav_map[-1].append(nav_point)
189
190
def _write_part(archive, annotations, part_xml, manifest, spine, part_number):
    """ Finish the part accumulated in ``part_xml`` and store it.

    Extracts the part's annotations, converts its stanzas to verses,
    renders it with xsltScheme.xsl into ``OPS/part<part_number>.html``
    inside ``archive`` and registers it in the manifest and the spine.
    """
    find_annotations(annotations, part_xml, part_number)
    replace_by_verse(part_xml)
    archive.writestr('OPS/part%d.html' % part_number,
                     etree.tostring(xslt(part_xml, res('xsltScheme.xsl')), pretty_print=True))
    add_to_manifest(manifest, part_number)
    add_to_spine(spine, part_number)


def transform(input_file, output_file):
    """ produces an epub

    input_file and output_file should be filelike objects

    The document read from ``input_file`` is split into parts at its
    headings; every part becomes ``OPS/part<N>.html``.  A title page,
    the stylesheet and logo, an optional annotations page, the package
    document (content.opf) and the table of contents (toc.ncx) are
    added, and the whole archive is written to ``output_file``.
    ``output_file`` itself is not closed.
    """

    input_xml = etree.parse(input_file)

    archive = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # The mimetype entry is written first and stored uncompressed.
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    archive.writestr(mime, 'application/epub+zip')

    archive.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                       'media-type="application/oebps-package+xml" />' \
                       '</rootfiles></container>')

    metadata_el = input_xml.find('.//'+RDFNS('Description'))
    metadatasource = etree.ElementTree(metadata_el)

    opf = xslt(metadatasource, res('xsltContent.xsl'))

    manifest = opf.find('.//' + OPFNS('manifest'))
    spine = opf.find('.//' + OPFNS('spine'))

    for fname in 'style.css', 'logo_wolnelektury.png':
        archive.write(res(fname), os.path.join('OPS', fname))

    annotations = etree.Element('annotations')
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                               '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                               '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                               '</navPoint><navPoint id="NavPoint-2" playOrder="2"><navLabel>' \
                               '<text>Początek utworu</text></navLabel><content src="part1.html" />' \
                               '</navPoint></navMap></ncx>')

    main_xml_part = part_xml[0] # used to be [0][0]; the <master> element
    nav_map = toc_file[-1] # used to be [-1][-1]
    depth = 1 # navmap

    if len(input_xml.getroot()) > 1:
        # rdf before style master
        main_text = input_xml.getroot()[1]
    else:
        # rdf in style master
        main_text = input_xml.getroot()[0]

    replace_characters(main_text)
    archive.writestr('OPS/title.html',
                     etree.tostring(xslt(input_xml, res('xsltTitle.xsl')), pretty_print=True))

    # Search for table of contents elements and book division

    # toc_counter numbers the navigation points created in the loop
    # (the two predefined ones are accounted for by the +1/+2 offsets),
    # part_counter is the number of the part being collected and
    # sub_counter numbers the sub-headings inside that part.
    toc_counter = part_counter = sub_counter = 1
    # True only while the previous element was a 'naglowek_czesc' heading.
    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            if name != "naglowek_czesc" and last_node_part:
                # A heading directly following a part heading stays in
                # the part that heading has just opened.  Copy it, like
                # every other branch does, rather than moving the node
                # out of the tree that is being iterated.
                main_xml_part.append(deepcopy(one_part))
                last_node_part = False
                name_toc = node_name(one_part)
                add_nav_point(nav_map, toc_counter + 1, name_toc, part_counter)
            else:
                # Close the part collected so far ...
                sub_counter = 1
                _write_part(archive, annotations, part_xml, manifest, spine, part_counter)
                # ... and start building a new one from this heading.
                main_xml_part[:] = [deepcopy(one_part)]
                name_toc = node_name(one_part)
                # build table of contents
                # +2 because of title page
                add_nav_point(nav_map, toc_counter + 2, name_toc, part_counter + 1)
                toc_counter += 1
                part_counter += 1
                last_node_part = (name == "naglowek_czesc")
        else:
            if name in ('naglowek_podrozdzial', 'naglowek_scena'):
                depth = 2
                name_toc = node_name(one_part)
                add_nav_point2(nav_map, toc_counter + 2, name_toc, part_counter, sub_counter)
                # Set before copying, so the copy carries the attribute
                # naming the anchor the nav point refers to.
                one_part.set('sub', str(sub_counter))
                sub_counter += 1
                toc_counter += 1
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False

    # The part still being collected when the text ends.
    _write_part(archive, annotations, part_xml, manifest, spine, part_counter)

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        nav_map.append(etree.fromstring(
            '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
            '</navLabel><content src="annotations.html" /></navPoint>' % {'i':toc_counter+2}))
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        archive.writestr('OPS/annotations.html', etree.tostring(
                            xslt(annotations, res("xsltAnnotations.xsl")), pretty_print=True))

    archive.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
    # Fill <head> with the four NCX meta entries, then set the first two
    # (dtb:uid and dtb:depth) to their real values.
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(depth))
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
    archive.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    archive.close()
342
343
# Command-line entry point: python epub.py <input file> [output file]
if __name__ == '__main__':
    import html

    if len(sys.argv) < 2:
        sys.stderr.write('Usage: python epub.py <input file> [output file]\n')
        sys.exit(1)

    input_path = sys.argv[1]
    if len(sys.argv) > 2:
        output_path = sys.argv[2]
    else:
        # Default: same name as the input, with an .epub extension.
        output_path = os.path.splitext(input_path)[0] + '.epub'

    print(input_path)
    if html.transform(input_path, is_file=True) == '<empty />':
        print('empty content - skipping')
    else:
        # An EPUB is a zip archive, so the output must be opened in
        # binary mode; text mode corrupts it on platforms that translate
        # line endings.  The with-blocks make sure both files are
        # flushed and closed even if the conversion fails.
        with open(input_path, 'rb') as source:
            with open(output_path, 'wb') as target:
                transform(source, target)
363
364
365