Fix HTML test.
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import re
11 import subprocess
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
15 import zipfile
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
18
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian.cover import DefaultEbookCover
21
22 from librarian import functions, get_resource
23
24 functions.reg_person_name()
25
26
def inner_xml(node):
    """ Serialize the content of a node: its leading text and its children.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    # Start with the node's own text (empty when there is none), then add
    # the serialized form of every child element in document order.
    parts = ['' if node.text is None else node.text]
    for child in node:
        parts.append(etree.tostring(child))
    return ''.join(parts)
36
def set_inner_xml(node, text):
    """ Replace a node's text and children with the parsed markup in `text`.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    # Parse the fragment inside a throw-away wrapper element, then move the
    # wrapper's text and children over to the target node.
    wrapper = etree.fromstring('<x>%s</x>' % text)
    new_children = wrapper[:]
    node.text = wrapper.text
    node[:] = new_children
49
50
def node_name(node):
    """ Return a node's plain-text name, with markup removed.

    Annotation and theme elements (pe, pa, pt, pr, motyw) are emptied first,
    so their content is left out; the text following them is kept.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    # Work on a copy so the caller's tree is left untouched.
    scratch = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for found in scratch.findall('.//' + tag):
            # clear() also drops the tail, so put it back afterwards.
            kept_tail = found.tail
            found.clear()
            found.tail = kept_tail
    etree.strip_tags(scratch, '*')
    return scratch.text
67
68
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet stored in file `sheet` to `xml`.

    `xml` may be a single element (it is wrapped in an ElementTree first)
    or an object that already provides an xslt() method.
    """
    tree = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return tree.xslt(stylesheet)
74
75
def replace_characters(node):
    """ Apply typographic substitutions to a tree, in place.

    Elements tagged 'uwaga' or 'extra' are emptied (their tail is kept);
    every other text and tail in the subtree gets the substitutions below.
    """
    # Applied in this order, so '---' is handled before '--'.
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    if node.tag in ('uwaga', 'extra'):
        # clear() also resets the tail, so save and restore it.
        saved_tail = node.tail
        node.clear()
        node.tail = saved_tail
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
94
95
def find_annotations(annotations, source, part_no):
    """ Collect annotations from `source` and replace them with numbers.

    Each pe/pa/pt/pr element found is copied into `annotations` with
    'number' and 'part' attributes; the original element is emptied and its
    text becomes the number. Content of 'extra' and 'uwaga' is not searched.
    """
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            # Numbering continues from what has been collected so far.
            number = str(len(annotations) + 1)

            note = deepcopy(child)
            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)

            # Leave only the reference number (and the tail) in the text.
            kept_tail = child.tail
            child.clear()
            child.text = number
            child.tail = kept_tail
        if child.tag in ('extra', 'uwaga'):
            continue
        find_annotations(annotations, child, part_no)
111
112
class Stanza(object):
    """
    Splits a stanza into verse elements at '/' verse endings.

    Only slashes found directly in the stanza count. Slashes inside
    subelements are left alone, and the subelements themselves end up
    inside verse elements.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
    
    """
    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        # Verse elements built so far, in order.
        self.verses = []
        # The verse currently receiving text and elements, if any.
        self.open_verse = None

    def versify(self):
        """ Rebuild the stanza so that it contains only verse elements. """
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        # clear() drops the tail as well, so keep it across the rebuild.
        saved_tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = saved_tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        """ Start a new, empty 'wers_normalny' verse and make it current. """
        verse = self.stanza.makeelement("wers_normalny")
        self.open_verse = verse
        self.verses.append(verse)

    def get_open_verse(self):
        """ Return the current verse, starting a normal one if needed. """
        if self.open_verse is not None:
            return self.open_verse
        self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """ Add text, opening a new verse after each '/' line ending. """
        if not text:
            return
        fragments = re.split(r"/\s*\n", text)
        for position, fragment in enumerate(fragments):
            if position > 0:
                self.open_normal_verse()
            verse = self.get_open_verse()
            if len(verse) == 0:
                # No child elements yet: the text belongs to the verse.
                verse.text = (verse.text or "") + fragment
            else:
                # Otherwise it follows the verse's last child element.
                last_child = verse[-1]
                last_child.tail = (last_child.tail or "") + fragment

    def push_elem(self, elem):
        """ Add a copy of an element, either as a verse or into a verse. """
        is_verse = elem.tag.startswith("wers")
        copied = deepcopy(elem)
        copied.tail = None
        if is_verse:
            # An explicit verse element becomes the current verse.
            self.verses.append(copied)
            self.open_verse = copied
        else:
            self.get_open_verse().append(copied)
174
def replace_by_verse(tree):
    """ Turn '/' verse endings into verse elements in every stanza of `tree` """

    for stanza in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza).versify()
181
182
def add_to_manifest(manifest, partno):
    """ Appends an item for part number `partno` to the manifest of content.opf """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
193
194
def add_to_spine(spine, partno):
    """ Appends an itemref for part number `partno` to the spine of content.opf """

    attributes = {'idref': 'part%d' % partno}
    item_ref = spine.makeelement(OPFNS('itemref'), attrib=attributes)
    spine.append(item_ref)
200
201
class TOC(object):
    """ A node of the table of contents tree.

    Each node has a display name, the href of the part file it points to,
    an optional sub-number (anchor inside that file) and child nodes.
    """
    def __init__(self, name=None, part_href=None):
        # Child TOC nodes, in display order.
        self.children = []
        self.name = name
        self.part_href = part_href
        # Set only for entries added with is_part=False; used as '#subN'.
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Add an entry `level` levels below this node.

        The entry is handed down to the last child while `level` is positive
        and a child exists; otherwise it is added here, at position `index`
        if given, or at the end. `index` may only be used with level 0.

        For is_part=False the entry gets a sub-number (the number of children
        after insertion, plus one) which is returned; otherwise the result
        is None.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)
        else:
            t = TOC(name)
            t.part_href = part_href
            if index is not None:
                self.children.insert(index, t)
            else:
                self.children.append(t)
            if not is_part:
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        """ Add a whole TOC node as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Add all children of another TOC node as children of this one. """
        self.children.extend(toc.children)

    def depth(self):
        """ Return the number of levels below this node (0 for a leaf). """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """ Return the link target: the part href plus '#subN' if numbered. """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ Append navPoint elements for all descendants to `nav_map`.

        `counter` numbers the navPoints (id and playOrder) in document
        order; the next unused value is returned.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            # Children are nested inside their parent's navPoint.
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Return the HTML lines for all descendants, indented by depth.

        Names and hrefs are escaped, so characters such as '&' or '<' in
        a title cannot break the generated markup.
        """
        # Local import keeps this class self-contained.
        from xml.sax.saxutils import escape

        texts = []
        for child in self.children:
            # '%s' formatting first keeps the original rendering of
            # non-string values; the href sits in a single-quoted attribute,
            # so apostrophes are escaped there as well.
            link = escape('%s' % (child.href(),), {"'": "&#39;"})
            label = escape('%s' % (child.name,))
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, link, label))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):
        """ Return the HTML table of contents page as a unicode string.

        The entries are inserted into the epub/toc.html resource template.
        """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
274
275
def used_chars(element):
    """ Returns the set of characters used in an ETree Element's subtree """
    own_text = (element.text or '') + (element.tail or '')
    found = set(own_text)
    for child in element:
        found |= used_chars(child)
    return found
282
283
def chop(main_text):
    """ Split the main content of the XML file into chunks.

    Yields an 'utwor' element holding a single 'master' child with copies
    of the chunk's nodes. The same element is yielded every time and is
    refilled afterwards, so each chunk has to be used before asking for
    the next one.
    """

    # one container, reused for every chunk
    part_xml = etree.Element('utwor')
    main_xml_part = etree.SubElement(part_xml, 'master')

    chapter_headers = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")
    # True right after a part header: a chapter header directly following
    # it stays in the same chunk.
    after_part_header = False
    for node in main_text:
        tag = node.tag
        starts_part = tag == 'naglowek_czesc'
        starts_chapter = not after_part_header and tag in chapter_headers
        if starts_part or starts_chapter:
            # Hand out what was gathered, then start over with this header.
            yield part_xml
            main_xml_part[:] = [deepcopy(node)]
            if starts_part:
                after_part_header = True
        else:
            main_xml_part.append(deepcopy(node))
            after_part_header = False
    yield part_xml
306
307
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk element; its first child holds the chunk's nodes
    chunk_no: number of the chunk, used to build its "part%d.html" href
    annotations: container that receives the annotations found in the chunk
    empty: if true, the chunk body is replaced by the empty-chunk template
        (headers are still added to the TOC)
    _empty_html_static: internal cache for the empty-chunk template; the
        shared default list is intentional, so the file is read only once
    """

    toc = TOC()
    part_href = "part%d.html" % chunk_no
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), part_href)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), part_href, level=1, is_part=False)
            # Stored on the element for use by the stylesheet.
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # Read through a context manager so the file handle is closed.
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
330
331
def transform(wldoc, verbose=False,
              style=None, html_toc=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    wldoc: the source document; a deep copy of it is processed
    verbose: if true, the font optimizer's output is not silenced
    style: path of the CSS file stored as OPS/style.css
        (the epub/style.css resource if not given)
    html_toc: if true, an HTML table of contents (OPS/toc.html) is added
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy; each flag is set
        as an attribute on the document root, and without-fonts also skips
        embedding fonts

    Returns an OutputFile created from the temporary .epub file.
    """
    # Local import: only needed to escape the title for the NCX docTitle.
    from xml.sax.saxutils import escape

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns the file's TOC, the next free chunk number, the set of
        characters used and the remaining sample budget.
        """

        replace_characters(wldoc.edoc.getroot())

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                with open(get_resource('epub/emptyChunk.html')) as empty_file:
                    html_string = empty_file.read()
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            epub_zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        # sample budget used up: emit placeholder chunks
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample


    document = deepcopy(wldoc)
    del wldoc

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    # add editors info
    document.edoc.getroot().set('editors', u', '.join(sorted(
        editor.readable() for editor in document.editors())))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    # Named epub_zip so the builtin zip() is not shadowed.
    epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # the mimetype entry is written first and stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    epub_zip.writestr(mime, 'application/epub+zip')
    epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                       'media-type="application/oebps-package+xml" />' \
                       '</rootfiles></container>')
    epub_zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
    epub_zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    epub_zip.write(style, os.path.join('OPS', 'style.css'))

    if cover:
        if cover is True:
            cover = DefaultEbookCover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        epub_zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        epub_zip.writestr('OPS/cover.html', etree.tostring(
                        cover_tree, method="html", pretty_print=True))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))


    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                               '</navMap></ncx>')
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        epub_zip.writestr('OPS/annotations.html', etree.tostring(
                            html_tree, method="html", pretty_print=True))

    toc.add("Wesprzyj Wolne Lektury", "support.html")
    manifest.append(etree.fromstring(
        '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="support" />'))
    with open(get_resource('epub/support.html')) as support_file:
        html_string = support_file.read()
    chars.update(used_chars(etree.fromstring(html_string)))
    epub_zip.writestr('OPS/support.html', html_string)

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    epub_zip.writestr('OPS/last.html', etree.tostring(
                        html_tree, method="html", pretty_print=True))

    if not flags or 'without-fonts' not in flags:
        # strip fonts
        tmpdir = mkdtemp('-librarian-epub')
        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None

        try:
            # subset.pl is run from inside the font-optimizer directory
            os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
            # the same character list is used for every font
            chars_arg = ''.join(chars).encode('utf-8')
            for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
                optimizer_call = ['perl', 'subset.pl', '--chars', chars_arg,
                                  get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
                if verbose:
                    print("Running font-optimizer")
                    subprocess.check_call(optimizer_call)
                else:
                    # Discard the output instead of piping it: nothing reads
                    # a pipe here, so a full pipe would block the child.
                    with open(os.devnull, 'w') as devnull:
                        subprocess.check_call(optimizer_call, stdout=devnull, stderr=devnull)
                epub_zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
                manifest.append(etree.fromstring(
                    '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
        finally:
            # clean up and go back even when the optimizer fails
            try:
                rmtree(tmpdir)
            finally:
                if cwd is not None:
                    os.chdir(cwd)

    epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    # The string is parsed as XML, so '&' and '<' in the title must be escaped.
    set_inner_xml(toc_file[1], ''.join(('<text>', escape(title), '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        epub_zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    epub_zip.close()
    # ZipFile does not close a file object it was given.
    output_file.close()

    return OutputFile.from_filename(output_file.name)