[epub, mobi] Fix for a problem with epubs, mobi in drama ebooks without acts
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import re
11 import subprocess
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
15 import zipfile
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
18
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian.cover import DefaultEbookCover
21
22 from librarian import functions, get_resource
23
# Import-time setup: call two registration hooks from librarian.functions.
# NOTE(review): presumably these register the person-name and the
# 3-letter-to-2-letter language code helpers as extension functions used by
# the XSLT stylesheets applied below -- confirm in librarian.functions.
functions.reg_person_name()
functions.reg_lang_code_3to2()
26
27
def inner_xml(node):
    """ serializes node's content: its leading text followed by every child

    The node's own tag is not part of the result; a missing text counts
    as an empty string.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    pieces = ['' if node.text is None else node.text]
    for child in node:
        pieces.append(etree.tostring(child))
    return ''.join(pieces)
37
def set_inner_xml(node, text):
    """ replaces node's text and children with the parsed content of a string

    The string is wrapped in a temporary <x> element for parsing, so it has
    to be a well-formed XML fragment.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    wrapper = etree.fromstring('<x>%s</x>' % text)
    node.text = wrapper.text
    # slice assignment swaps out all existing children at once
    node[:] = list(wrapper)
50
51
def node_name(node):
    """ Extract a node's name as plain text

    Works on a deep copy, so the node passed in is left untouched.
    The content of pe, pa, pt, pr and motyw descendants is dropped (the text
    following them is kept); all remaining markup is stripped.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    scratch = deepcopy(node)

    skipped_tags = ('pe', 'pa', 'pt', 'pr', 'motyw')
    for tag in skipped_tags:
        for unwanted in scratch.findall('.//' + tag):
            # clear() wipes the tail as well, so put it back afterwards
            kept_tail = unwanted.tail
            unwanted.clear()
            unwanted.tail = kept_tail
    etree.strip_tags(scratch, '*')
    return scratch.text
68
69
def xslt(xml, sheet):
    """ applies the XSLT stylesheet read from the file `sheet` to `xml`

    `xml` may be an element or an element tree; a bare element is wrapped
    in an ElementTree first. Returns the result of the tree's xslt() call.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as sheet_file:
        stylesheet = etree.parse(sheet_file)
        return document.xslt(stylesheet)
75
76
def replace_characters(node):
    """ applies typographic replacements to the text of node and its subtree

    In every text and tail: U+FEFF is removed, "---" becomes U+2014,
    "--" becomes U+2013, ",," becomes U+201E, '"' becomes U+201D and
    "'" becomes U+2019. Elements tagged uwaga or extra are emptied
    (only their tail survives). The tree is modified in place.
    """
    # order matters: "---" has to be handled before "--"
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    if node.tag in ('uwaga', 'extra'):
        # clear() drops the tail too, so restore it
        kept_tail = node.tail
        node.clear()
        node.tail = kept_tail
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
95
96
def find_annotations(annotations, source, part_no):
    """ moves annotations found below `source` into `annotations`

    Every pe, pa, pt or pr descendant is deep-copied, given a `number`
    attribute (its 1-based position in `annotations`) and a `part`
    attribute (`part_no`), and appended to `annotations`. The original
    element is emptied and left holding just that number as its text.
    The content of extra and uwaga elements is not searched.
    """
    for element in source:
        tag = element.tag
        if tag in ('pe', 'pa', 'pt', 'pr'):
            ordinal = str(len(annotations) + 1)
            note = deepcopy(element)
            note.set('number', ordinal)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)
            # leave a numbered placeholder where the annotation used to be
            kept_tail = element.tail
            element.clear()
            element.tail = kept_tail
            element.text = ordinal
        if tag in ('extra', 'uwaga'):
            continue
        find_annotations(annotations, element, part_no)
112
113
class Stanza(object):
    """
    Splits a stanza into verse elements at "/" verse endings.

    Only slashes found directly in the stanza's own text count. Slashes
    inside subelements are left alone, and such subelements end up inside
    verse elements.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
    
    """
    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        # verse elements collected so far, in document order
        self.verses = []
        # the verse currently receiving content, if any
        self.open_verse = None

    def versify(self):
        """ rebuilds the stanza in place as a sequence of verse elements """
        stanza = self.stanza
        self.push_text(stanza.text)
        for child in stanza:
            self.push_elem(child)
            self.push_text(child.tail)
        # clear() also resets the tail, so carry it over
        kept_tail = stanza.tail
        stanza.clear()
        stanza.tail = kept_tail
        stanza.extend(self.verses)

    def open_normal_verse(self):
        """ starts a new wers_normalny element and makes it the open verse """
        fresh = self.stanza.makeelement("wers_normalny")
        self.open_verse = fresh
        self.verses.append(fresh)

    def get_open_verse(self):
        """ returns the open verse, starting a normal one when none is open """
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def _append_text(self, fragment):
        """ adds a piece of text at the end of the open verse """
        verse = self.get_open_verse()
        if len(verse) == 0:
            verse.text = (verse.text or "") + fragment
        else:
            last = verse[-1]
            last.tail = (last.tail or "") + fragment

    def push_text(self, text):
        """ adds text, starting a new verse after every "/" line ending """
        if not text:
            return
        fragments = re.split(r"/\s*\n", text)
        self._append_text(fragments[0])
        for fragment in fragments[1:]:
            self.open_normal_verse()
            self._append_text(fragment)

    def push_elem(self, elem):
        """ adds a copy of elem: as a verse of its own when its tag starts
        with "wers", otherwise as a child of the open verse """
        is_verse = elem.tag.startswith("wers")
        clone = deepcopy(elem)
        # the tail is handled separately through push_text()
        clone.tail = None
        if is_verse:
            self.verses.append(clone)
            self.open_verse = clone
        else:
            self.get_open_verse().append(clone)
174
175
def replace_by_verse(tree):
    """ Versify every stanza below `tree`: each '/' verse ending becomes
    a verse element (see Stanza) """

    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
182
183
def add_to_manifest(manifest, partno):
    """ Appends an <item> for part number `partno` to the manifest section
    of the content.opf file """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    item = manifest.makeelement(OPFNS('item'), attrib=attributes)
    manifest.append(item)
194
195
def add_to_spine(spine, partno):
    """ Appends an <itemref> for part number `partno` to the spine section
    of the content.opf file """

    attributes = {'idref': 'part%d' % partno}
    itemref = spine.makeelement(OPFNS('itemref'), attrib=attributes)
    spine.append(itemref)
201
202
class TOC(object):
    """
    A node of the table-of-contents tree.

    Every node has a display name, the href of the file it points to,
    an optional sub_number (rendered by href() as a "#sub<N>" fragment)
    and a list of child nodes.
    """
    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ adds an entry to the tree

        level: how many levels below this node the entry should go; while
            level > 0 and this node has children, the call is passed on to
            the last child with level decreased by one
        is_part: when false, the new entry gets a sub_number and that
            number is returned
        index: position among this node's children (only allowed with
            level == 0); the entry is appended when omitted

        Returns the new entry's sub_number, or None for an is_part entry.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)
        else:
            t = TOC(name)
            t.part_href = part_href
            if index is not None:
                self.children.insert(index, t)
            else:
                self.children.append(t)
            if not is_part:
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        """ adds an existing TOC node as the last child """
        self.children.append(toc)

    def extend(self, toc):
        """ adds all children of another TOC node to this node's children """
        self.children.extend(toc.children)

    def depth(self):
        """ returns the number of levels below this node (0 for a leaf) """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """ returns the link target, with "#sub<N>" appended when the node
        has a sub_number """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ writes the children (recursively) as NCX navPoint elements

        nav_map: element the navPoints are appended to
        counter: number used for the first navPoint's id and playOrder

        Returns the next unused counter value.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ returns the children (recursively) as HTML, one indented <div>
        with a link per entry

        Entry names are plain text, so they are XML-escaped here; a missing
        name gives an empty link text.
        """
        # local import: keeps the class independent of the module's imports
        from xml.sax.saxutils import escape

        texts = []
        for child in self.children:
            label = escape(child.name) if child.name is not None else ''
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), label))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):
        """ returns the complete HTML table of contents, built by filling
        the epub/toc.html resource template with html_part() """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
275
276
def used_chars(element):
    """ Returns the set of characters found in the text and tail of an
    ETree Element and of all its descendants """
    found = set()
    pending = [element]
    while pending:
        current = pending.pop()
        found.update((current.text or '') + (current.tail or ''))
        # queue the children of the element just visited
        pending.extend(current)
    return found
283
284
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator. Walks over the direct children of `main_text` and yields
    an <utwor><master>...</master></utwor> element for every chunk.

    A new chunk starts at each naglowek_czesc and at each chunk header
    that does not directly follow a naglowek_czesc. Chunk headers are
    naglowek_rozdzial, naglowek_akt and srodtytul; in a drama that has
    scenes but no acts, naglowek_scena is the chunk header instead.

    The very same element object is yielded each time and refilled
    afterwards, so a chunk has to be used before the next one is requested.
    A chunk is yielded before the first header even if it is still empty.
    """

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    main_xml_part = etree.SubElement(part_xml, 'master')

    # workaround for a problem with epubs in drama ebooks without acts:
    # when there are scenes but no acts, the scenes delimit the chunks.
    # Tags are compared with ==, so comments and processing instructions
    # (whose tag is not a string) are simply not treated as headers.
    has_scenes = any(part.tag == 'naglowek_scena' for part in main_text)
    has_acts = any(part.tag == 'naglowek_akt' for part in main_text)
    if has_scenes and not has_acts:
        chunk_headers = ('naglowek_scena',)
    else:
        chunk_headers = ('naglowek_rozdzial', 'naglowek_akt', 'srodtytul')

    # true right after a naglowek_czesc: the header that follows it
    # stays in the same chunk
    last_node_part = False

    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in chunk_headers:
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
336
337
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk element as produced by chop(); its first child holds
        the chunk's content
    chunk_no: chunk number, used to build "part<N>.html" links
    annotations: element collecting the annotations found in the chunk
    empty: when true, the chunk's text is replaced with the
        epub/emptyChunk.html resource (the TOC is still built)
    _empty_html_static: internal; the shared default list deliberately
        caches the empty-chunk HTML between calls
    """

    part_href = "part%d.html" % chunk_no
    toc = TOC()
    for element in chunk_xml[0]:
        # compare with == / tuple membership only: `tag in "some string"`
        # is a substring test and fails for comment nodes
        tag = element.tag
        if tag == "naglowek_czesc":
            toc.add(node_name(element), part_href + "#book-text")
        elif tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), part_href)
        elif tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), part_href, level=1, is_part=False)
            # stored on the element so that it is available to the stylesheet
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
362
363
def transform(wldoc, verbose=False,
              style=None, html_toc=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    wldoc: the source document; it is deep-copied first and all changes
        are made to the copy
    verbose: print a message before each font-optimizer run and leave the
        optimizer's output attached to the console
    style: path of the CSS file stored as OPS/style.css
        (default: the epub/style.css resource)
    html_toc: also generate an HTML table of contents (OPS/toc.html)
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy, with-full-fonts

    Returns OutputFile.from_filename() for the temporary .epub file written.
    """
    # local import: keeps this block independent of the module's import list
    from xml.sax.saxutils import escape

    def read_resource(name):
        """ returns the content of a bundled resource file and closes it """
        with open(get_resource(name)) as resource_file:
            return resource_file.read()

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns a tuple (toc, chunk_counter, chars, sample): the TOC of
        this file, the next free chunk number, the set of characters used
        and the remaining sample allowance.
        """

        replace_characters(wldoc.edoc.getroot())

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                html_string = read_resource('epub/emptyChunk.html')
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        # allowance used up: remaining chunks are left empty
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample


    document = deepcopy(wldoc)
    del wldoc

    if flags:
        # flags are handed over as attributes of the root element
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    # add editors info
    document.edoc.getroot().set('editors', u', '.join(sorted(
        editor.readable() for editor in document.editors())))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # the mimetype entry is written first and stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    zip.writestr(mime, 'application/epub+zip')
    zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                       'media-type="application/oebps-package+xml" />' \
                       '</rootfiles></container>')
    zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
    zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    zip.write(style, os.path.join('OPS', 'style.css'))

    if cover:
        if cover is True:
            cover = DefaultEbookCover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        zip.writestr('OPS/cover.html', etree.tostring(
                        cover_tree, method="html", pretty_print=True))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))


    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                               '</navMap></ncx>')
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        zip.writestr('OPS/annotations.html', etree.tostring(
                            html_tree, method="html", pretty_print=True))

    toc.add("Wesprzyj Wolne Lektury", "support.html")
    manifest.append(etree.fromstring(
        '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="support" />'))
    html_string = read_resource('epub/support.html')
    chars.update(used_chars(etree.fromstring(html_string)))
    zip.writestr('OPS/support.html', html_string)

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    zip.writestr('OPS/last.html', etree.tostring(
                        html_tree, method="html", pretty_print=True))

    if not flags or not 'without-fonts' in flags:
        # strip fonts
        tmpdir = mkdtemp('-librarian-epub')
        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None

        try:
            # subset.pl is run from the font-optimizer directory
            os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
            for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
                          'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
                if not flags or not 'with-full-fonts' in flags:
                    optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
                                  get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
                    if verbose:
                        print("Running font-optimizer")
                        subprocess.check_call(optimizer_call)
                    else:
                        # discard the output; a PIPE that nobody reads
                        # could block the child process
                        with open(os.devnull, 'w') as devnull:
                            subprocess.check_call(optimizer_call, stdout=devnull, stderr=devnull)
                    # the subsetted font goes into the archive in both modes
                    zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
                else:
                    zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
                manifest.append(etree.fromstring(
                    '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
        finally:
            # restore the working directory and clean up even when the
            # optimizer fails
            if cwd is not None:
                os.chdir(cwd)
            rmtree(tmpdir)
    zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', str(document.book_info.url))
    toc_file[0][1].set('content', str(toc.depth()))
    # the title is plain text: escape it before it is parsed as XML
    set_inner_xml(toc_file[1], ''.join(('<text>', escape(title), '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
    zip.close()
    # ZipFile does not close a file object that was passed in
    output_file.close()

    return OutputFile.from_filename(output_file.name)