e992f40716ea2cb5fb6b23e33a68d50deb69cc6b
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.  
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import subprocess
11 from copy import deepcopy
12 from lxml import etree
13 import zipfile
14 from tempfile import mkdtemp
15 from shutil import rmtree
16
17 import sys
18 sys.path.append('..') # for running from working copy
19
20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
21 from librarian.dcparser import BookInfo
22
23 from librarian import functions
24
25 functions.reg_person_name()
26
27
def inner_xml(node):
    """ serializes the content of a node: its leading text followed by
    all of its child elements

    >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
    x<b>y</b>z
    """

    # a missing text is treated as an empty string
    pieces = [node.text or '']
    for child in node:
        pieces.append(etree.tostring(child))
    return ''.join(pieces)
37
def set_inner_xml(node, text):
    """ replaces the content of a node (text and children) with
    the result of parsing an XML fragment given as a string

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print(etree.tostring(e))
    <a>x<b>y</b>z</a>
    """

    # the fragment may have no single root, so parse it inside a wrapper
    fragment = etree.fromstring('<x>%s</x>' % text)
    node.text = fragment.text
    node[:] = list(fragment)
50
51
def node_name(node):
    """ returns the plain text of a node, with all markup removed
    and the content of pe, pa, pt, pr and motyw elements left out

    >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
    XYZ
    """

    # work on a copy, the caller's tree must stay untouched
    scratch = deepcopy(node)

    ignored_tags = ('pe', 'pa', 'pt', 'pr', 'motyw')
    for tag in ignored_tags:
        for element in scratch.findall('.//%s' % tag):
            # empty the element, but keep the text that follows it
            following_text = element.tail
            element.clear()
            element.tail = following_text

    # once every tag is stripped, all remaining text ends up in .text
    etree.strip_tags(scratch, '*')
    return scratch.text
68
69
def xslt(xml, sheet):
    """ applies the XSLT stylesheet read from the file `sheet` to `xml`

    `xml` may be an element or an element tree; a bare element
    is wrapped in a tree first
    """
    document = xml
    if isinstance(document, etree._Element):
        document = etree.ElementTree(document)
    with open(sheet) as sheet_file:
        stylesheet = etree.parse(sheet_file)
        return document.xslt(stylesheet)
75
76
# the 'epub' directory located next to this module (symlinks resolved)
_resdir = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'epub')
def res(fname):
    """ returns the path of the file `fname` inside the resource directory """
    return os.path.join(_resdir, fname)
80
81
def replace_characters(node):
    """ replaces ASCII punctuation with typographic characters
    in the text of a node and, recursively, of all its descendants

    Replacements: '---' becomes an em dash, '--' an en dash, ',,' a low
    double quotation mark, '"' a right double quotation mark and "'"
    a right single quotation mark.

    An 'extra' element is emptied (text, attributes and children are
    removed) instead of being converted; the text that follows it is kept
    and converted like any other text.

    The tree is modified in place; nothing is returned.
    """
    def replace_chars(text):
        # elements without text or tail have None there; leave it as is
        if text is None:
            return None
        # the longest sequence goes first, so that '---' is not
        # consumed by the '--' replacement
        return text.replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")
    if node.tag == 'extra':
        # clear() would also drop the tail, which is not a part of
        # the element's content but the text following it - keep it
        tail = node.tail
        node.clear()
        node.tail = replace_chars(tail)
    else:
        node.text = replace_chars(node.text)
        node.tail = replace_chars(node.tail)
        for child in node:
            replace_characters(child)
98
99
def find_annotations(annotations, source, part_no):
    """ moves annotations found in the `source` subtree to `annotations`

    A copy of every pe, pa, pt or pr element is appended to `annotations`,
    with a `number` attribute (consecutive, counted from the current length
    of `annotations`) and a `part` attribute set to `part_no`. The original
    element is emptied and gets the number as its only text.

    The content of extra and podtytul elements is not searched.
    """
    annotation_tags = ('pe', 'pa', 'pt', 'pr')
    skipped_tags = ('extra', 'podtytul')

    for element in source:
        if element.tag in annotation_tags:
            number = str(len(annotations) + 1)

            note = deepcopy(element)
            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)

            # leave just the number in place of the annotation,
            # keeping the text that follows the element
            following_text = element.tail
            element.clear()
            element.text = number
            element.tail = following_text
        if element.tag in skipped_tags:
            continue
        find_annotations(annotations, element, part_no)
115
116
def replace_by_verse(tree):
    """ Find stanzas and split them into verses on a '/' ending a line

    Every piece of a stanza separated by '/' followed by a newline is
    wrapped in a wers_normalny element, unless it already starts with
    a '<wers...' or '<extra...' tag. Before that, the same is done inside
    slowo_obce and wyroznienie elements lying directly in the stanza
    (there only pieces starting with '<wers...' are left unwrapped).
    Elements without such a separator are left untouched.
    """

    def wrap_verses(element, untouched_prefixes):
        # rewrite the element's content only when it has a verse separator
        pieces = inner_xml(element).split('/\n')
        if len(pieces) <= 1:
            return
        wrapped = []
        for piece in pieces:
            if piece.startswith(untouched_prefixes):
                wrapped.append(piece)
            else:
                wrapped.append('<wers_normalny>' + piece + '</wers_normalny>')
        set_inner_xml(element, ''.join(wrapped))

    for stanza in tree.findall('.//' + WLNS('strofa')):
        for child in stanza:
            if child.tag in ('slowo_obce', 'wyroznienie'):
                wrap_verses(child, ('<wers',))
        wrap_verses(stanza, ('<wers', '<extra'))
142
143
def add_to_manifest(manifest, partno):
    """ Appends an item describing part number `partno` to the manifest
    section of the content.opf file """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
154
155
def add_to_spine(spine, partno):
    """ Appends a reference to part number `partno` to the spine
    section of the content.opf file """

    attributes = {'idref': 'part%d' % partno}
    spine.append(spine.makeelement(OPFNS('itemref'), attrib=attributes))
161
162
class TOC(object):
    """ A node of the table of contents tree

    Each node holds a name, the number of the part (partN.html) it points
    to, an optional number of an anchor inside that part (sub_number)
    and a list of child nodes.
    """

    def __init__(self, name=None, part_number=None):
        self.name = name
        self.part_number = part_number
        self.sub_number = None
        self.children = []

    def add(self, name, part_number, level=0, is_part=True):
        """ Adds a new entry `level` levels below this node

        On each level the entry goes under the most recently added child;
        if there are no children to descend into, it is added right here.
        For is_part=False the entry gets an anchor number, which is
        returned; otherwise None is returned.
        """
        if level > 0 and self.children:
            last_child = self.children[-1]
            return last_child.add(name, part_number, level - 1, is_part)

        entry = TOC(name, part_number)
        self.children.append(entry)
        if is_part:
            return None
        # computed after the entry was appended, so numbering starts at 2
        entry.sub_number = len(self.children) + 1
        return entry.sub_number

    def append(self, toc):
        """ Adds a whole TOC node as the last child """
        self.children.append(toc)

    def extend(self, toc):
        """ Adds all children of another TOC node as children of this one """
        self.children.extend(toc.children)

    def depth(self):
        """ Returns the number of levels below this node """
        if not self.children:
            return 0
        return 1 + max(child.depth() for child in self.children)

    def write_to_xml(self, nav_map, counter):
        """ Writes the children of this node as navPoint elements

        The elements are appended to `nav_map`, nested according to
        the tree structure and numbered consecutively starting with
        `counter`. Returns the first unused number.
        """
        for entry in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            label_text = nav_map.makeelement(NCXNS('text'))
            label_text.text = entry.name
            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            nav_label.append(label_text)
            nav_point.append(nav_label)

            target = 'part%d.html' % entry.part_number
            if entry.sub_number is not None:
                target += '#sub%d' % entry.sub_number
            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', target)
            nav_point.append(content)

            nav_map.append(nav_point)
            # descendants are numbered right after their parent
            counter = entry.write_to_xml(nav_point, counter + 1)
        return counter
214
215
def used_chars(element):
    """ Returns a set of characters used in the text of an ETree Element
    and its descendants (the text following the element included) """
    own_text = (element.text or '') + (element.tail or '')
    found = set(own_text)
    for child in element:
        found.update(used_chars(child))
    return found
222
223
def chop(main_text):
    """ divides the main content of the XML file into chunks

    A generator. A new chunk starts at every naglowek_czesc element and
    at every naglowek_rozdzial, naglowek_akt or srodtytul element, unless
    it directly follows a naglowek_czesc. Chunks contain copies of
    the original elements, wrapped in <utwor><master>...</master></utwor>.

    The same container element is reused for all the chunks, so each
    chunk has to be processed before the next one is requested.
    The first chunk is empty if the content starts with a header.
    """

    chapter_tags = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    # one container, refilled for every chunk
    part_xml = etree.Element('utwor')
    master = etree.SubElement(part_xml, 'master')

    after_part_header = False
    for node in main_text:
        is_part_header = node.tag == 'naglowek_czesc'
        is_chapter_header = node.tag in chapter_tags and not after_part_header
        if is_part_header or is_chapter_header:
            # hand out what was gathered so far, then start a new chunk
            yield part_xml
            master[:] = [deepcopy(node)]
        else:
            master.append(deepcopy(node))
        after_part_header = is_part_header
    yield part_xml
246
247
def transform_chunk(chunk_xml, chunk_no, annotations):
    """ transforms one chunk

    Headers found in the chunk are registered in a new TOC (subsection
    and scene headers additionally get a `sub` attribute with their anchor
    number), annotations are moved to `annotations`, stanzas are split
    into verses and the result is converted with xsltScheme.xsl.

    returns a HTML string, a TOC object and a set of used characters
    """

    part_headers = ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul")
    section_headers = ('naglowek_podrozdzial', 'naglowek_scena')

    toc = TOC()
    for element in chunk_xml[0]:
        tag = element.tag
        if tag in part_headers:
            toc.add(node_name(element), chunk_no)
        elif tag in section_headers:
            anchor_no = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            element.set('sub', str(anchor_no))

    find_annotations(annotations, chunk_xml, chunk_no)
    replace_by_verse(chunk_xml)

    html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
    chars = used_chars(html_tree.getroot())
    return etree.tostring(html_tree, pretty_print=True), toc, chars
264
265
def transform(provider, slug, output_file=None, output_dir=None):
    """ produces an epub

    provider is a DocProvider
    either output_file (a file-like object) or output_dir (path to file/dir) should be specified
    if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.epub

    a file given as output_file is left open; a file created because of
    output_dir is closed before returning

    raises NoDublinCore if the document has no RDF Description element
    """

    # needed only in this function, so imported locally
    from xml.sax.saxutils import escape

    def transform_file(input_xml, chunk_counter=1, first=True):
        """ processes one input file and proceeds to its children

        writes the generated pages to the archive and registers them in
        the manifest and the spine

        returns a tuple: TOC of the file, the next unused chunk number
        and a set of characters used in the generated pages
        """

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(input_xml, res('xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, pretty_print=True))
        elif children:
            # write title page for every parent
            html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/part%d.html' % chunk_counter,
                etree.tostring(html_tree, pretty_print=True))
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        else:
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            replace_characters(main_text)

            for chunk_xml in chop(main_text):
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in children:
            child_xml = etree.parse(provider.by_uri(child))
            child_toc, chunk_counter, chunk_chars = transform_file(child_xml, chunk_counter, first=False)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars

    # read metadata from the first file
    input_xml = etree.parse(provider[slug])
    metadata = input_xml.find('.//'+RDFNS('Description'))
    if metadata is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    opened_here = False
    if output_dir is not None:
        author = unicode(book_info.author)
        author_dir = os.path.join(output_dir, author)
        try:
            os.makedirs(author_dir)
        except OSError:
            # usually the directory just exists already; if it could not
            # be created, the open() below fails anyway
            pass
        # a zip archive is binary data, so the file must not be opened in text mode
        output_file = open(os.path.join(author_dir, '%s.epub' % slug), 'wb')
        opened_here = True

    try:
        epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

        # write static elements
        # the mimetype entry is written first and stored uncompressed
        mime = zipfile.ZipInfo()
        mime.filename = 'mimetype'
        mime.compress_type = zipfile.ZIP_STORED
        mime.extra = ''
        epub_zip.writestr(mime, 'application/epub+zip')
        epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                           'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                           '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                           'media-type="application/oebps-package+xml" />' \
                           '</rootfiles></container>')
        for fname in 'style.css', 'logo_wolnelektury.png':
            epub_zip.write(res(fname), os.path.join('OPS', fname))

        # these are filled in by transform_file as parts are generated
        opf = xslt(metadata, res('xsltContent.xsl'))
        manifest = opf.find('.//' + OPFNS('manifest'))
        spine = opf.find('.//' + OPFNS('spine'))

        annotations = etree.Element('annotations')

        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                                   '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                                   '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                                   'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                                   '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                                   '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                                   '</navPoint></navMap></ncx>')
        nav_map = toc_file[-1]

        toc, chunk_counter, chars = transform_file(input_xml)

        if not toc.children:
            toc.add(u"Początek utworu", 1)
        # number 1 is taken by the title page defined in the template above
        toc_counter = toc.write_to_xml(nav_map, 2)

        # Last modifications in container files and EPUB creation
        if len(annotations) > 0:
            nav_map.append(etree.fromstring(
                '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
                '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
            manifest.append(etree.fromstring(
                '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
            spine.append(etree.fromstring(
                '<itemref idref="annotations" />'))
            replace_by_verse(annotations)
            html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
            chars = chars.union(used_chars(html_tree.getroot()))
            epub_zip.writestr('OPS/annotations.html', etree.tostring(
                                html_tree, pretty_print=True))

        # strip fonts
        tmpdir = mkdtemp('-librarian-epub')
        cwd = os.getcwd()
        try:
            # subset.pl is started from its own directory
            os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../font-optimizer'))
            used = ''.join(chars).encode('utf-8')
            for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
                subprocess.check_call(['./subset.pl', '--chars', used, res('../fonts/' + fname), os.path.join(tmpdir, fname)])
                epub_zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
        finally:
            # restore the working directory and clean up even if subsetting fails
            os.chdir(cwd)
            rmtree(tmpdir)

        epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
        title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
        # the template's head is empty; the first two of the metas added
        # here (uid and depth) get their real values below
        attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
        for st in attributes:
            meta = toc_file.makeelement(NCXNS('meta'))
            meta.set('name', st)
            meta.set('content', '0')
            toc_file[0].append(meta)
        toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
        toc_file[0][1].set('content', str(toc.depth()))
        # the title is plain text - it has to be escaped before being
        # embedded in markup, or '&' and '<' would break the parser
        set_inner_xml(toc_file[1], ''.join(('<text>', escape(title), '</text>')))
        epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
        epub_zip.close()
    finally:
        # close only what was opened here; a file passed by the caller
        # remains the caller's responsibility
        if opened_here:
            output_file.close()
425
426
# Command line usage: converts the given file, writing the result
# under the directory the input file is in.
if __name__ == '__main__':
    from librarian import DirDocProvider

    if len(sys.argv) < 2:
        # sys.stderr.write works the same regardless of print syntax
        sys.stderr.write('Usage: python epub.py <input file>\n')
        sys.exit(1)

    main_input = sys.argv[1]
    basepath, ext = os.path.splitext(main_input)
    # the input file's directory is handed to DirDocProvider, and its
    # base name (without extension) is the slug; os.path.split
    # does not depend on a particular path separator
    path, slug = os.path.split(os.path.realpath(basepath))
    provider = DirDocProvider(path)
    transform(provider, slug, output_dir=path)
439