[epub, mobi] fix for hanging single letter conjunctions - currently disabled
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import re
11 import subprocess
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
15 import zipfile
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
18
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian.cover import DefaultEbookCover
21
22 from librarian import functions, get_resource
23
# Register helpers from librarian.functions once, at import time.
# NOTE(review): presumably these are extension functions used by the
# XSLT stylesheets applied below -- confirm in librarian/functions.py.
functions.reg_person_name()
functions.reg_lang_code_3to2()
26
27
def inner_xml(node):
    """ Serialize a node's leading text and its children into one string.

    The node's own start and end tags are not part of the result.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    pieces = [node.text or '']
    for child in node:
        # each child is serialized together with its tail text
        pieces.append(etree.tostring(child))
    return ''.join(pieces)
37
def set_inner_xml(node, text):
    """ Replace a node's text and children with the parsed XML fragment `text`.

    The fragment is wrapped in a temporary element for parsing, so it
    has to be well-formed XML content.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    wrapper = etree.fromstring('<x>%s</x>' % text)
    node[:] = list(wrapper)
    node.text = wrapper.text
50
51
def node_name(node):
    """ Return the plain-text name of a node.

    Works on a copy of the node. Every 'pe', 'pa', 'pt', 'pr' and 'motyw'
    descendant is emptied (its tail text is kept), then all remaining tags
    are stripped and the resulting text is returned.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    work = deepcopy(node)
    skipped_tags = ('pe', 'pa', 'pt', 'pr', 'motyw')

    for tag in skipped_tags:
        for found in work.findall('.//' + tag):
            # clear() also drops the tail, so carry it over by hand
            kept_tail = found.tail
            found.clear()
            found.tail = kept_tail
    etree.strip_tags(work, '*')
    return work.text
68
69
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet stored in the file `sheet` to `xml`.

    A bare element is wrapped in an ElementTree first; any other object
    is used as given and must provide an `xslt` method.
    """
    tree = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as sheet_file:
        stylesheet = etree.parse(sheet_file)
        return tree.xslt(stylesheet)
75
76
def replace_characters(node):
    """ Apply typographic substitutions to all text in `node`, in place.

    'uwaga' and 'extra' elements are emptied (their tail is kept). The
    text and tail of every element in the subtree get the replacements
    listed below, applied in that order.
    """
    # (old, new) pairs; order matters: "---" has to be handled before "--"
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        #text = re.sub(r"(?<=\s\w)\s+", u"\u00a0", text) #fix for hanging single letter conjunctions – for future use.
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    if node.tag in ('uwaga', 'extra'):
        # clear() drops the tail as well, so save and restore it
        kept_tail = node.tail
        node.clear()
        node.tail = kept_tail
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
96
97
def find_annotations(annotations, source, part_no):
    """ Move annotations found under `source` into `annotations`.

    Each 'pe', 'pa', 'pt' or 'pr' element is copied into `annotations`
    with 'number' (1-based position in `annotations`) and 'part'
    (`part_no`) attributes, and the original element is reduced to a
    marker holding just that number. Children of 'extra' and 'uwaga'
    elements are not searched.
    """
    annotation_tags = ('pe', 'pa', 'pt', 'pr')
    for elem in source:
        if elem.tag in annotation_tags:
            number = str(len(annotations) + 1)
            note = deepcopy(elem)
            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)
            # leave only the reference number in the running text
            following = elem.tail
            elem.clear()
            elem.text = number
            elem.tail = following
        if elem.tag in ('extra', 'uwaga'):
            continue
        find_annotations(annotations, elem, part_no)
113
114
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Only slashes located directly in the stanza's own text (or in the
    tails of its children) split verses. Slashes inside subelements are
    left alone, and the subelements end up inside verse elements.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """
    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        # verse elements collected so far, in document order
        self.verses = []
        # the verse currently receiving text and elements
        self.open_verse = None

    def versify(self):
        """ Rebuild the stanza so that it contains only verse elements. """
        stanza = self.stanza
        self.push_text(stanza.text)
        for child in stanza:
            self.push_elem(child)
            self.push_text(child.tail)
        # clear() drops the tail too, so put it back afterwards
        kept_tail = stanza.tail
        stanza.clear()
        stanza.tail = kept_tail
        stanza.extend(self.verses)

    def open_normal_verse(self):
        """ Start a new 'wers_normalny' verse and make it the open one. """
        self.open_verse = self.stanza.makeelement("wers_normalny")
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        """ Return the open verse, starting a normal one if there is none. """
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """ Add text to the stanza, starting a new verse at each '/' line end. """
        if not text:
            return
        pieces = re.split(r"/\s*\n", text)
        for position, piece in enumerate(pieces):
            if position > 0:
                self.open_normal_verse()
            target = self.get_open_verse()
            if len(target) > 0:
                # the verse already has children: text goes after the last one
                last = target[-1]
                last.tail = (last.tail or "") + piece
            else:
                target.text = (target.text or "") + piece

    def push_elem(self, elem):
        """ Add a copy of `elem`; an element tagged 'wers*' becomes the open verse. """
        copied = deepcopy(elem)
        copied.tail = None
        if elem.tag.startswith("wers"):
            self.verses.append(copied)
            self.open_verse = copied
        else:
            self.get_open_verse().append(copied)
175
176
def replace_by_verse(tree):
    """ Convert '/' verse endings into verse elements in every stanza of `tree`. """

    for stanza in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza).versify()
183
184
def add_to_manifest(manifest, partno):
    """ Append an item for part number `partno` to the manifest of content.opf """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
195
196
def add_to_spine(spine, partno):
    """ Append an itemref for part number `partno` to the spine of content.opf """

    itemref = spine.makeelement(OPFNS('itemref'),
                                attrib={'idref': 'part%d' % partno})
    spine.append(itemref)
202
203
class TOC(object):
    """ A table-of-contents tree.

    Every node has a display `name`, the `part_href` of the file it
    points to, an optional `sub_number` (rendered as a '#subN' fragment)
    and a list of child nodes.
    """

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Add an entry `level` levels below this node.

        For level > 0 the entry is delegated to the last child; if there
        are no children yet it is added right here. `index` (allowed only
        with level 0) inserts at that position instead of appending.

        For is_part=False the new entry gets a sub_number, computed as the
        number of siblings after insertion plus one, and that number is
        returned. Otherwise None is returned.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)
        else:
            t = TOC(name)
            t.part_href = part_href
            if index is not None:
                self.children.insert(index, t)
            else:
                self.children.append(t)
            if not is_part:
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        """ Add `toc` itself as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Add all children of `toc` (not `toc` itself) as children. """
        self.children.extend(toc.children)

    def depth(self):
        """ Return the number of levels below this node (0 for a leaf). """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """ Return the link target: part_href plus '#subN' if sub_number is set. """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ Append NCX navPoint elements for all descendants to `nav_map`.

        navPoints are numbered depth-first starting from `counter`;
        the next unused number is returned.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Return the HTML list of links for all descendants.

        Each entry is a div indented by `depth` em. Entry names are
        escaped, so characters like '&' or '<' in a title do not break
        the markup.
        """
        from xml.sax.saxutils import escape

        texts = []
        for child in self.children:
            label = child.name
            if label is not None:
                label = escape(label)
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), label))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):
        """ Return the epub/toc.html template filled with the link list. """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
276
277
def used_chars(element):
    """ Return the set of characters used in an ETree Element.

    Covers the text and tail of the element itself and, recursively,
    of all its descendants.
    """
    found = set(element.text or '')
    found.update(element.tail or '')
    for child in element:
        found |= used_chars(child)
    return found
284
285
def chop(main_text):
    """ Divide the main content of the XML file into chunks.

    Generator yielding an 'utwor' element with a single 'master' child
    that holds copies of the elements of the current chunk. The very same
    container object is yielded every time and refilled between yields,
    so each chunk has to be processed before the next one is requested.

    A new chunk starts at every 'naglowek_czesc' and at every splitting
    header, except for a splitting header directly following a
    'naglowek_czesc'. The container is also yielded before the first
    header, even if it is still empty.
    """

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    main_xml_part = etree.SubElement(part_xml, 'master')

    # Workaround for dramas without acts: when the text has scene headers
    # but no act headers, the scene headers start new chunks.
    tags = [one_part.tag for one_part in main_text]
    is_scene = 'naglowek_scena' in tags
    is_act = 'naglowek_akt' in tags
    if is_scene and not is_act:
        # a real one-element tuple: membership test, not substring test
        splitting_headers = ("naglowek_scena",)
    else:
        splitting_headers = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    # True right after a 'naglowek_czesc': a header that follows it
    # directly stays in the same chunk
    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in splitting_headers:
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
331
332
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ Transform one chunk.

    Returns a tuple: an HTML string, a TOC object and a set of used
    characters.

    chunk_xml: chunk container; headers are looked up in its first child
    chunk_no: chunk number, used to build 'part<N>.html' links
    annotations: element collecting annotations found in the chunk
    empty: if true, the chunk content is replaced with the static
        epub/emptyChunk.html page (TOC entries are still created)
    _empty_html_static: internal cache for the empty page; the shared
        mutable default is deliberate, so the file is read only once
    """

    toc = TOC()
    part_href = "part%d.html" % chunk_no
    for element in chunk_xml[0]:
        tag = element.tag
        # plain equality here: `tag in ("naglowek_czesc")` would be
        # a substring test against a string
        if tag == "naglowek_czesc":
            toc.add(node_name(element), part_href + "#book-text")
        elif tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), part_href)
        elif tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), part_href, level=1, is_part=False)
            # stored on the element so that it is available to the stylesheet
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
357
358
def transform(wldoc, verbose=False,
              style=None, html_toc=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    wldoc: the document to convert; a deep copy is processed
    verbose: print a notice and do not silence the font optimizer
    style: path to a CSS file; defaults to the epub/style.css resource
    html_toc: also generate an HTML table of contents (toc.html)
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy, with-full-fonts
        (each flag is set as an attribute with value 'yes' on the
        document root)

    Returns OutputFile.from_filename() of the temporary EPUB file.
    """
    from xml.sax.saxutils import escape

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns a tuple: the TOC of the file, the next free chunk number,
        the set of used characters and the remaining sample count.
        """

        replace_characters(wldoc.edoc.getroot())

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                with open(get_resource('epub/emptyChunk.html')) as empty_file:
                    html_string = empty_file.read()
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            epub_zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        # sample quota used up: the rest becomes empty chunks
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample


    # work on a copy, the conversion modifies the document tree
    document = deepcopy(wldoc)
    del wldoc

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    # add editors info
    document.edoc.getroot().set('editors', u', '.join(sorted(
        editor.readable() for editor in document.editors())))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # the mimetype entry is written first and stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    epub_zip.writestr(mime, 'application/epub+zip')
    epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                       'media-type="application/oebps-package+xml" />' \
                       '</rootfiles></container>')
    epub_zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
    epub_zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    epub_zip.write(style, os.path.join('OPS', 'style.css'))

    if cover:
        if cover is True:
            cover = DefaultEbookCover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        epub_zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        epub_zip.writestr('OPS/cover.html', etree.tostring(
                        cover_tree, method="html", pretty_print=True))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))


    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                               '</navMap></ncx>')
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        epub_zip.writestr('OPS/annotations.html', etree.tostring(
                            html_tree, method="html", pretty_print=True))

    toc.add("Wesprzyj Wolne Lektury", "support.html")
    manifest.append(etree.fromstring(
        '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="support" />'))
    with open(get_resource('epub/support.html')) as support_file:
        html_string = support_file.read()
    chars.update(used_chars(etree.fromstring(html_string)))
    epub_zip.writestr('OPS/support.html', html_string)

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    epub_zip.writestr('OPS/last.html', etree.tostring(
                        html_tree, method="html", pretty_print=True))

    if not flags or 'without-fonts' not in flags:
        # strip fonts
        tmpdir = mkdtemp('-librarian-epub')
        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None

        try:
            # subset.pl is called by its relative name, so run from its directory
            os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
            for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
                if not flags or 'with-full-fonts' not in flags:
                    optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
                                  get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
                    if verbose:
                        print("Running font-optimizer")
                        subprocess.check_call(optimizer_call)
                    else:
                        # discard the output; PIPE must not be used with
                        # check_call, as an unread pipe can block the child
                        with open(os.devnull, 'w') as devnull:
                            subprocess.check_call(optimizer_call, stdout=devnull, stderr=devnull)
                    # the subsetted font goes into the archive in both modes
                    epub_zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
                else:
                    epub_zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
                manifest.append(etree.fromstring(
                    '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
        finally:
            # restore the working directory and clean up even on failure
            if cwd is not None:
                os.chdir(cwd)
            rmtree(tmpdir)
    epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', str(document.book_info.url))
    toc_file[0][1].set('content', str(toc.depth()))
    # the title is plain text: escape it before it is parsed as XML
    set_inner_xml(toc_file[1], ''.join(('<text>', escape(title), '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        epub_zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
    epub_zip.close()
    # ZipFile does not close a file object it was given; close it here
    # (the file stays on disk because of delete=False)
    output_file.close()

    return OutputFile.from_filename(output_file.name)