a615b7e97416ac69cc0e3b1e14374516262d5e2e
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import subprocess
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
14 import zipfile
15 from tempfile import mkdtemp
16 from shutil import rmtree
17
18 import sys
19
20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
21 from librarian.dcparser import BookInfo
22
23 from librarian import functions, get_resource
24
25 functions.reg_person_name()
26
27
def inner_xml(node):
    """ serializes the content of a node (its text and child elements) to a string

    The node's own start and end tags are not included.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    # leading text first (empty when missing), then every child serialized in order
    pieces = [node.text or '']
    pieces.extend(etree.tostring(child) for child in node)
    return ''.join(pieces)

37
def set_inner_xml(node, text):
    """ replaces the content of a node (text and children) with a parsed XML fragment

    node: element to modify in place
    text: string with the new content; it must be well-formed once wrapped in a single element

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    # parse the fragment inside a throw-away wrapper element, then move its content over
    wrapper = etree.fromstring('<x>%s</x>' % text)
    node.text = wrapper.text
    node[:] = list(wrapper)

50
51
def node_name(node):
    """ returns the plain-text name of a node

    Works on a copy, so the node passed in is not modified. The content of
    'pe', 'pa', 'pt', 'pr' and 'motyw' descendants is dropped (the text that
    follows them is kept); all remaining markup is stripped.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    clone = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for element in clone.findall('.//%s' % tag):
            # clear() also resets the tail, so it is saved and put back
            saved_tail = element.tail
            element.clear()
            element.tail = saved_tail
    # merge the text of all remaining descendants into the root element
    etree.strip_tags(clone, '*')
    return clone.text

68
69
def xslt(xml, sheet):
    """ applies an XSLT stylesheet to a document

    xml: an element or an element tree
    sheet: path to the stylesheet file

    Returns the result of the transformation.
    """
    # a bare element is wrapped in a tree, because the transformation is run on trees
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)

75
76
def replace_characters(node):
    """ replaces ASCII punctuation with typographic characters in a whole subtree

    Modifies, in place, the text and tail of the node and of all its descendants.
    """
    # applied in this order: '---' has to be handled before '--'
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)

91
92
def find_annotations(annotations, source, part_no):
    """ moves annotations out of a tree and leaves their numbers in the text

    annotations: container the annotations are appended to; its length gives the next number
    source: element whose descendants are searched (modified in place)
    part_no: number of the chunk, stored in the 'part' attribute of each annotation

    Every 'pe', 'pa', 'pt' and 'pr' element is copied to annotations with
    'number' and 'part' attributes set; the original element is emptied and
    gets the number as its only text. Content of 'extra' and 'uwaga'
    elements is not searched.
    """
    annotation_tags = ('pe', 'pa', 'pt', 'pr')
    skipped_tags = ('extra', 'uwaga')

    for element in source:
        if element.tag in annotation_tags:
            note = deepcopy(element)
            marker = str(len(annotations)+1)
            note.set('number', marker)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)
            # clear() drops the tail as well, so it is saved and restored
            kept_tail = element.tail
            element.clear()
            element.tail = kept_tail
            element.text = marker
        if element.tag in skipped_tags:
            continue
        find_annotations(annotations, element, part_no)

108
109
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    Verses are separated by a slash followed by a newline. Inside every
    stanza the content of 'slowo_obce' and 'wyroznienie' children is split
    first, then the content of the stanza itself. The tree is modified in place.
    """

    def split_into_verses(element, passthrough):
        """ wraps each verse of the element in <wers_normalny>, unless it starts with a passthrough prefix """
        fragments = inner_xml(element).split('/\n')
        if len(fragments) <= 1:
            # no verse separator found, leave the element untouched
            return
        rebuilt = []
        for fragment in fragments:
            if fragment.startswith(passthrough):
                rebuilt.append(fragment)
            else:
                rebuilt.append('<wers_normalny>' + fragment + '</wers_normalny>')
        set_inner_xml(element, ''.join(rebuilt))

    for stanza in tree.findall('.//' + WLNS('strofa')):
        for inline in stanza:
            if inline.tag in ('slowo_obce', 'wyroznienie'):
                split_into_verses(inline, ('<wers',))
        split_into_verses(stanza, ('<wers', '<extra'))

135
136
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    manifest: the manifest element
    partno: number of the chunk; the item gets id 'part<N>' and href 'part<N>.html'
    """

    item_id = 'part%d' % partno
    attributes = {
        'id': item_id,
        'href': item_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))

147
148
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    spine: the spine element
    partno: number of the chunk; the itemref points to the item with id 'part<N>'
    """

    reference = {'idref': 'part%d' % partno}
    spine.append(spine.makeelement(OPFNS('itemref'), attrib=reference))

154
155
class TOC(object):
    """ a node of the table of contents tree

    name: label of the entry
    part_number: number of the chunk file (part<N>.html) the entry points to
    sub_number: number of the anchor (#sub<N>) inside the chunk; None for entries pointing to a whole chunk
    children: list of nested TOC entries
    """

    def __init__(self, name=None, part_number=None):
        self.name = name
        self.part_number = part_number
        self.sub_number = None
        self.children = []

    def add(self, name, part_number, level=0, is_part=True):
        """ adds a new entry, `level` levels below this one if possible

        While level is positive and there are children, the request is passed
        down to the most recently added child. Otherwise the entry is added
        right here.

        Returns the sub_number given to the entry if is_part is false, None otherwise.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level-1, is_part)

        entry = TOC(name, part_number)
        self.children.append(entry)
        if is_part:
            return None
        # numbered after appending, so the first sub-entry of a node gets number 2
        entry.sub_number = len(self.children) + 1
        return entry.sub_number

    def append(self, toc):
        """ adds an existing TOC node as the last child """
        self.children.append(toc)

    def extend(self, toc):
        """ adds all children of another TOC node as children of this one """
        self.children.extend(toc.children)

    def depth(self):
        """ returns the number of levels below this node (0 if it has no children) """
        if not self.children:
            return 0
        return 1 + max(child.depth() for child in self.children)

    def write_to_xml(self, nav_map, counter):
        """ writes the children of this node as nested navPoint elements

        nav_map: element the navPoints are appended to
        counter: number used for the id and playOrder of the first navPoint written

        Returns the first number not used.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            label_text = nav_map.makeelement(NCXNS('text'))
            label_text.text = child.name
            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            nav_label.append(label_text)
            nav_point.append(nav_label)

            target = 'part%d.html' % child.part_number
            if child.sub_number is not None:
                target += '#sub%d' % child.sub_number
            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', target)
            nav_point.append(content)

            nav_map.append(nav_point)
            # nested entries go inside this navPoint and continue the numbering
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

207
208
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set of all characters found in the text and tail of the
    element and of all its descendants.
    """
    found = set(element.text or '')
    found.update(element.tail or '')
    for child in element:
        found |= used_chars(child)
    return found

215
216
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator. A new chunk starts at every 'naglowek_czesc' and at every
    'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul' that does not directly
    follow a 'naglowek_czesc'. The chunk collected so far is yielded before a
    new one is started, so the first chunk yielded may be empty.

    Each chunk is an 'utwor' element with a single 'master' child holding
    copies of the nodes. The same element is yielded every time and refilled
    afterwards, so it has to be used before the generator is resumed.
    """

    # prepare a container for each chunk
    chunk_root = etree.Element('utwor')
    master = etree.SubElement(chunk_root, 'master')

    chapter_headers = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")
    after_part_header = False
    for node in main_text:
        tag = node.tag
        starts_part = tag == 'naglowek_czesc'
        # a chapter header right after a part header stays in the part's chunk
        starts_chapter = not after_part_header and tag in chapter_headers
        if starts_part or starts_chapter:
            yield chunk_root
            master[:] = [deepcopy(node)]
            after_part_header = starts_part
        else:
            master.append(deepcopy(node))
            after_part_header = False
    yield chunk_root

239
240
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk as produced by chop(); headers are read from its first child
    chunk_no: number of the chunk, used in TOC entries and annotations
    annotations: container the annotations found in the chunk are moved to
    empty: if true, the chunk content is replaced with the placeholder page
        (the TOC is still built and the set of characters is empty)
    _empty_html_static: internal cache of the placeholder page; the mutable
        default is intentional, so that the file is read only once per process
    """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            # stored on the element, so that the anchor matches the TOC entry
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # read the placeholder with the file closed deterministically
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars

263
264
def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    provider: a DocProvider
    slug: slug of file to process, available by provider
    file_path: path to the file to process; an alternative to slug (exactly one of them must be given)
    output_file: file-like object or path to output file
    output_dir: path to directory to save output file to; either this or output_file must be present
    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
    verbose: if true, the output of the font optimizer is not silenced
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a callable (e.g. a cover.Cover class) called with the readable author name and the title;
        the returned object has to provide save(), ext() and mime_type()
    flags: names of attributes to set to 'yes' on the root element of the input (e.g. less-advertising)

    Raises ValueError if both or none of slug and file_path are given,
    and NoDublinCore if the document has no RDF description.
    """

    def empty_chunk_html():
        """ returns the placeholder page used for content left out of a sample """
        with open(get_resource('epub/emptyChunk.html')) as empty_file:
            return empty_file.read()

    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns the TOC of the file, the next free chunk number, the set of
        characters used and the remaining sample budget.
        """

        replace_characters(input_xml.getroot())

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
        elif children:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                html_string = empty_chunk_html()
            else:
                html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            epub_zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        else:
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        # sample budget used up: only placeholders from now on
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        if children:
            for child in children:
                child_xml = etree.parse(provider.by_uri(child))
                child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
                toc.append(child_toc)
                chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    # read metadata from the first file
    if file_path:
        if slug:
            raise ValueError('slug or file_path should be specified, not both')
        with open(file_path, 'r') as f:
            input_xml = etree.parse(f)
    else:
        if not slug:
            raise ValueError('either slug or file_path should be specified')
        input_xml = etree.parse(provider[slug])

    if flags:
        for flag in flags:
            input_xml.getroot().set(flag, 'yes')

    metadata = input_xml.find('.//'+RDFNS('Description'))
    if metadata is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    opened_output = False
    if output_dir is not None:
        if make_dir:
            author = unicode(book_info.author)
            output_dir = os.path.join(output_dir, author)
            try:
                os.makedirs(output_dir)
            except OSError:
                # most likely the directory already exists;
                # any other problem shows up when the file is opened below
                pass
        if slug:
            output_path = os.path.join(output_dir, '%s.epub' % slug)
        else:
            output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub')
        # a ZIP archive is binary data, so the file must not be opened in text mode
        output_file = open(output_path, 'wb')
        opened_output = True

    epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
    try:
        # write static elements
        mime = zipfile.ZipInfo()
        mime.filename = 'mimetype'
        mime.compress_type = zipfile.ZIP_STORED
        mime.extra = ''
        epub_zip.writestr(mime, 'application/epub+zip')
        epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                           'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                           '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                           'media-type="application/oebps-package+xml" />' \
                           '</rootfiles></container>')
        epub_zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
        epub_zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))

        opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
        manifest = opf.find('.//' + OPFNS('manifest'))
        spine = opf.find('.//' + OPFNS('spine'))

        if cover:
            cover_file = StringIO()
            c = cover(book_info.author.readable(), book_info.title)
            c.save(cover_file)
            c_name = 'cover.%s' % c.ext()
            epub_zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
            del cover_file

            cover_tree = etree.parse(get_resource('epub/cover.html'))
            cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
            epub_zip.writestr('OPS/cover.html', etree.tostring(
                            cover_tree, method="html", pretty_print=True))

            manifest.append(etree.fromstring(
                '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
            manifest.append(etree.fromstring(
                '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
            spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
            opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
            opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))

        annotations = etree.Element('annotations')

        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                                   '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                                   '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                                   'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                                   '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                                   '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                                   '</navPoint></navMap></ncx>')
        nav_map = toc_file[-1]

        toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)

        if not toc.children:
            toc.add(u"Początek utworu", 1)
        # NavPoint-1 is the title page, so the numbering of the rest starts at 2
        toc_counter = toc.write_to_xml(nav_map, 2)

        # Last modifications in container files and EPUB creation
        if len(annotations) > 0:
            nav_map.append(etree.fromstring(
                '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
                '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
            toc_counter += 1
            manifest.append(etree.fromstring(
                '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
            spine.append(etree.fromstring(
                '<itemref idref="annotations" />'))
            replace_by_verse(annotations)
            html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
            chars = chars.union(used_chars(html_tree.getroot()))
            epub_zip.writestr('OPS/annotations.html', etree.tostring(
                                html_tree, method="html", pretty_print=True))

        nav_map.append(etree.fromstring(
            '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
            '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
        manifest.append(etree.fromstring(
            '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="last" />'))
        html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
        chars.update(used_chars(html_tree.getroot()))
        epub_zip.writestr('OPS/last.html', etree.tostring(
                            html_tree, method="html", pretty_print=True))

        # strip fonts
        tmpdir = mkdtemp('-librarian-epub')
        # the optimizer is run from its own directory; passing cwd to the child
        # process avoids changing the working directory of the whole program
        optimizer_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer')
        chars_arg = ''.join(chars).encode('utf-8')
        try:
            for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
                optimizer_call = ['perl', 'subset.pl', '--chars', chars_arg,
                                  get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
                if verbose:
                    print("Running font-optimizer")
                    subprocess.check_call(optimizer_call, cwd=optimizer_dir)
                else:
                    # output is discarded; pipes nobody reads from could fill up
                    # and block the child process forever
                    with open(os.devnull, 'w') as devnull:
                        subprocess.check_call(optimizer_call, cwd=optimizer_dir,
                                              stdout=devnull, stderr=devnull)
                epub_zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
        finally:
            # remove the temporary fonts even if the optimizer failed
            rmtree(tmpdir)

        epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
        title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
        attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
        for st in attributes:
            meta = toc_file.makeelement(NCXNS('meta'))
            meta.set('name', st)
            meta.set('content', '0')
            toc_file[0].append(meta)
        toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
        toc_file[0][1].set('content', str(toc.depth()))
        # the title is set as text of a new element instead of being pasted
        # into an XML string, so characters like '&' or '<' are escaped properly
        doc_title = toc_file[1]
        doc_title.text = None
        doc_title[:] = []
        title_text = doc_title.makeelement(NCXNS('text'))
        title_text.text = title
        doc_title.append(title_text)
        epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    finally:
        try:
            epub_zip.close()
        finally:
            # only a file opened here is closed; one passed by the caller is left open
            if opened_output:
                output_file.close()