frame margins in epub/mobi
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import re
11 import subprocess
12 from StringIO import StringIO
13 from copy import deepcopy
14 from mimetypes import guess_type
15
16 from lxml import etree
17 import zipfile
18 from tempfile import mkdtemp, NamedTemporaryFile
19 from shutil import rmtree
20
21 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
22 from librarian.cover import WLCover
23
24 from librarian import functions, get_resource
25
# Runs once, as a side effect of importing this module.
# NOTE(review): presumably registers the person-name helper used by the XSLT
# stylesheets -- confirm in librarian.functions.
functions.reg_person_name()
27
28
def inner_xml(node):
    """ Serialize the content of a node: its leading text and its children.

    The node's own start and end tags are not part of the result.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    pieces = ['' if node.text is None else node.text]
    for child in node:
        pieces.append(etree.tostring(child))
    return ''.join(pieces)
38
def set_inner_xml(node, text):
    """ Replace a node's leading text and children with parsed XML content.

    `text` is wrapped in a throw-away <x> element and parsed, so it has to
    be a well-formed XML fragment.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    wrapper = etree.fromstring('<x>%s</x>' % text)
    # take over the wrapper's leading text first, then all of its children
    node.text = wrapper.text
    node[:] = wrapper[:]
51
52
def node_name(node):
    """ Return a node's name as plain text.

    Works on a deep copy, so the node passed in is left untouched.
    Footnote elements (pe, pa, pt, pr) and 'motyw' elements are emptied,
    keeping only their tails; all remaining markup is then stripped.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    scratch = deepcopy(node)

    for skipped_tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for found in scratch.findall('.//%s' % skipped_tag):
            # clear() would also drop the tail, which belongs to the parent's text
            kept_tail = found.tail
            found.clear()
            found.tail = kept_tail
    etree.strip_tags(scratch, '*')
    return scratch.text
69
70
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet stored in the file `sheet` to `xml`.

    `xml` may be an element or an element tree; a bare element is wrapped
    in an ElementTree first. Returns the result of the transformation.
    """
    tree = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return tree.xslt(stylesheet)
76
77
def replace_characters(node):
    """ Apply typographic substitutions to a node and all its descendants.

    The text and tail of every visited node are rewritten in place: the
    byte-order mark is removed, '---' and '--' become dashes, ',,' and the
    straight quote characters become typographic quotes.
    Nodes tagged 'uwaga' or 'extra' are emptied first (only the tail stays).
    """
    # order matters: '---' has to be handled before '--'
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    if node.tag in ('uwaga', 'extra'):
        kept_tail = node.tail
        node.clear()
        node.tail = kept_tail
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
96
97
def find_annotations(annotations, source, part_no):
    """ Collect footnotes found below `source` and replace them by numbers.

    Every pe/pa/pt/pr element is copied into `annotations` with 'number'
    and 'part' attributes set and an empty tail. The original element is
    emptied (its tail is kept) and its text becomes the footnote number.
    The search does not descend into 'extra' and 'uwaga' elements.
    """
    footnote_tags = ('pe', 'pa', 'pt', 'pr')
    skipped_tags = ('extra', 'uwaga')

    for child in source:
        if child.tag in footnote_tags:
            # numbering continues across calls: it follows the collection size
            number = str(len(annotations)+1)
            note = deepcopy(child)
            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)

            # leave only the footnote number in the running text
            kept_tail = child.tail
            child.clear()
            child.tail = kept_tail
            child.text = number
        if child.tag in skipped_tags:
            continue
        find_annotations(annotations, child, part_no)
113
114
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring("<strofa>a/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a</wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """
    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        # verse elements built so far, in document order
        self.verses = []
        # the verse currently receiving text and inline elements
        self.open_verse = None

    def versify(self):
        """ Rebuild the stanza in place so that it contains only verses. """
        stanza = self.stanza
        self.push_text(stanza.text)
        for child in stanza:
            self.push_elem(child)
            self.push_text(child.tail)
        # clear() drops the tail as well, so carry it over
        kept_tail = stanza.tail
        stanza.clear()
        stanza.tail = kept_tail
        stanza.extend(self.verses)

    def open_normal_verse(self):
        """ Start a new 'wers_normalny' verse and make it the open one. """
        self.open_verse = self.stanza.makeelement("wers_normalny")
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        """ Return the open verse, starting a normal one if there is none. """
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """ Add text to the verses; each '/' + newline starts a new verse.

        Empty or whitespace-only text is ignored.
        """
        if not (text and text.strip()):
            return
        for index, fragment in enumerate(re.split(r"/\s*\n", text)):
            if index > 0:
                self.open_normal_verse()
            verse = self.get_open_verse()
            fragment = fragment.strip()
            if len(verse) == 0:
                verse.text = (verse.text or "") + fragment
            else:
                # text following an inline element lives in that element's tail
                last = verse[-1]
                last.tail = (last.tail or "") + fragment

    def push_elem(self, elem):
        """ Add a copy of a stanza child: as a verse or into the open verse.

        Elements whose tag starts with 'wers' become verses themselves;
        anything else is appended to the currently open verse.
        """
        clone = deepcopy(elem)
        # the tail is fed separately through push_text()
        clone.tail = None
        if elem.tag.startswith("wers"):
            self.verses.append(clone)
            self.open_verse = clone
        else:
            self.get_open_verse().append(clone)
175
176
def replace_by_verse(tree):
    """ Turn '/' verse endings into verse elements in every stanza of a tree.

    All 'strofa' elements below `tree` are rewritten in place.
    """

    for stanza in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza).versify()
183
184
def add_to_manifest(manifest, partno):
    """ Append an item for part number `partno` to the manifest element
    of the content.opf file.

    The item gets the id 'partN' and points to 'partN.html'.
    """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
195
196
def add_to_spine(spine, partno):
    """ Append an itemref for part number `partno` to the spine element
    of the content.opf file.
    """

    attributes = {'idref': 'part%d' % partno}
    spine.append(spine.makeelement(OPFNS('itemref'), attrib=attributes))
202
203
class TOC(object):
    """ A node of the table-of-contents tree.

    Every node keeps a display name, the href of the file it points to,
    an optional sub-anchor number and a list of child nodes.
    """

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        # set only for entries pointing inside a part file (see add())
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Add a new entry to the tree.

        name: display name of the entry
        part_href: file the entry points to
        level: how many levels below this node the entry belongs; while it
            is positive and there are children, the entry is handed down to
            the last child. Without children it is added right here.
        is_part: when false, the entry points at an anchor inside the part
            file and gets a sub-anchor number
        index: position among this node's children (only with level 0);
            by default the entry is appended

        Returns the sub-anchor number if is_part is false, otherwise None.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)
        else:
            t = TOC(name)
            t.part_href = part_href
            if index is not None:
                self.children.insert(index, t)
            else:
                self.children.append(t)
            if not is_part:
                # counted after insertion: number of children plus one
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        """ Add a whole TOC node as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Add all children of another TOC node to this node's children. """
        self.children.extend(toc.children)

    def depth(self):
        """ Return the number of levels below this node (0 for a leaf). """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """ Return the link target: the part file plus '#subN' if numbered. """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ Write the children of this node as NCX navPoint elements.

        nav_map: element the navPoints are appended to
        counter: number used for the id and playOrder of the first navPoint

        Nested entries become nested navPoints. Returns the next unused
        counter value.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Return the entries below this node as HTML, one div per entry.

        depth: nesting level, used as the left margin (in em) of the entries

        Entry names are escaped, so '&', '<' and '>' coming from the source
        text cannot break the markup; an entry without a name gets an empty
        label.
        """
        from xml.sax.saxutils import escape

        texts = []
        for child in self.children:
            label = '' if child.name is None else escape(child.name)
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), label))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):
        """ Return the HTML table of contents.

        The entries are put into the epub/toc.html template (read as UTF-8).
        """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
276
277
def used_chars(element):
    """ Return the set of characters used in an ETree Element.

    The text and tail of the element itself and of all its descendants
    are taken into account.
    """
    own_text = (element.text or '') + (element.tail or '')
    found = set(own_text)
    for child in element:
        found.update(used_chars(child))
    return found
284
285
def chop(main_text):
    """ Divide the main content of the XML file into chunks.

    This is a generator. A chunk ends right before every 'naglowek_czesc'
    and before every chapter-level heading (naglowek_rozdzial, naglowek_akt,
    srodtytul), unless that heading directly follows a 'naglowek_czesc'.

    The very same <utwor><master/></utwor> container is yielded every time
    and refilled between the yields, so a chunk has to be consumed before
    the next one is requested. The first chunk yielded has an empty master
    if the text starts with a heading.
    """

    # one container, reused for all the chunks
    container = etree.Element('utwor')
    master = etree.SubElement(container, 'master')
    chapter_headings = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    directly_after_part = False
    for node in main_text:
        tag = node.tag
        if tag == 'naglowek_czesc':
            yield container
            directly_after_part = True
            master[:] = [deepcopy(node)]
        elif tag in chapter_headings and not directly_after_part:
            yield container
            master[:] = [deepcopy(node)]
        else:
            master.append(deepcopy(node))
            directly_after_part = False
    yield container
308
309
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk container; the elements of its first child are processed
    chunk_no: number of the chunk, used for the 'partN.html' hrefs
    annotations: collection the footnotes found in the chunk are added to
    empty: if true, the chunk is not transformed; the content of
        epub/emptyChunk.html and an empty character set are returned instead
        (the TOC is built and 'sub' attributes are set anyway)
    _empty_html_static: private cache of the empty chunk HTML; the mutable
        default is intentional, it is shared between the calls
    """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            # the number is stored on the element as its 'sub' attribute
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # read the template once and close the file right away
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
332
333
def transform(wldoc, verbose=False, style=None, html_toc=False,
              sample=None, cover=None, flags=None, ilustr_path=''):
    """ produces a EPUB file

    wldoc: source document; it is deep-copied first, so the object passed in
        is not modified
    verbose: if true, the output of the font optimizer is not suppressed
    style: path to a CSS file to use instead of the default epub/style.css
    html_toc: if true, an HTML table of contents (toc.html) is added as well
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default; None selects the
        default as well, any other false value disables the cover
    flags: less-advertising, without-fonts, working-copy
    ilustr_path: directory whose files are all added to the package

    Returns an OutputFile made from the name of a temporary .epub file.
    Raises subprocess.CalledProcessError if the font optimizer fails.
    """

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns a tuple: the TOC of the file, the next unused chunk number,
        the set of used characters and the remaining sample counter.
        """

        replace_characters(wldoc.edoc.getroot())

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                with open(get_resource('epub/emptyChunk.html')) as empty_file:
                    html_string = empty_file.read()
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            epub_zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    # once the sample is used up, only empty chunks follow
                    if sample <= 0:
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample


    # local import: used to put the book title safely into the NCX markup
    from xml.sax.saxutils import escape

    # work on a copy, the caller's document stays as it was
    document = deepcopy(wldoc)
    del wldoc

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    # add editors info
    document.edoc.getroot().set('editors', u', '.join(sorted(
        editor.readable() for editor in document.editors())))

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    if os.path.isdir(ilustr_path):
        for i, filename in enumerate(os.listdir(ilustr_path)):
            file_path = os.path.join(ilustr_path, filename)
            epub_zip.write(file_path, os.path.join('OPS', filename))
            image_id = 'image%s' % i
            manifest.append(etree.fromstring(
                '<item id="%s" href="%s" media-type="%s" />' % (image_id, filename, guess_type(file_path)[0])))

    # write static elements
    # the mimetype entry is stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    epub_zip.writestr(mime, 'application/epub+zip')
    epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                       'media-type="application/oebps-package+xml" />' \
                       '</rootfiles></container>')
    #epub_zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
    epub_zip.write(get_resource('res/koedlogo.png'), os.path.join('OPS', 'logo_koed.png'))
    #epub_zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    epub_zip.write(style, os.path.join('OPS', 'style.css'))

    if cover is None:
        cover = WLCover
    if cover:
        if cover is True:
            cover = WLCover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        epub_zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        epub_zip.writestr('OPS/cover.html', etree.tostring(
                        cover_tree, method="html", pretty_print=True))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))


    # footnotes from all the chunks are collected here by transform_chunk()
    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                               '</navMap></ncx>')
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        epub_zip.writestr('OPS/annotations.html', etree.tostring(
                            html_tree, method="html", pretty_print=True))

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    epub_zip.writestr('OPS/last.html', etree.tostring(
                        html_tree, method="html", pretty_print=True))

    if not flags or 'without-fonts' not in flags:
        # strip fonts
        tmpdir = mkdtemp('-librarian-epub')
        # subset.pl is run from its own directory; only the child process
        # gets that working directory, this process stays where it is
        optimizer_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer')
        try:
            for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
                optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
                                  get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
                if verbose:
                    print("Running font-optimizer")
                    subprocess.check_call(optimizer_call, cwd=optimizer_dir)
                else:
                    # discard the output; pipes nobody reads could fill up
                    # and block the optimizer forever
                    with open(os.devnull, 'w') as devnull:
                        subprocess.check_call(optimizer_call, cwd=optimizer_dir,
                                              stdout=devnull, stderr=devnull)
                epub_zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
                manifest.append(etree.fromstring(
                    '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
        finally:
            # remove the temporary fonts even if the optimizer failed
            rmtree(tmpdir)

    epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    # the title is parsed as XML here, so '&' and '<' have to be escaped
    set_inner_xml(toc_file[1], ''.join(('<text>', escape(title), '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        epub_zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    epub_zip.close()
    # the zip was given an open file, so closing that file is up to us;
    # it stays on disk (delete=False)
    output_file.close()

    return OutputFile.from_filename(output_file.name)