book2mobi (using calibre)
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import subprocess
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
14 import zipfile
15 from tempfile import mkdtemp
16 from shutil import rmtree
17
18 import sys
19
20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
21 from librarian.dcparser import BookInfo
22
23 from librarian import functions, get_resource
24
# Runs once at import time. NOTE(review): presumably registers the
# person-name extension function used by the XSLT stylesheets below -
# confirm in librarian.functions.
functions.reg_person_name()
26
27
def inner_xml(node):
    """ serializes the content of a node: its leading text followed by every child

    Each child is serialized with etree.tostring(), so (as the example
    shows) the text following a child is part of the result.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    # a missing leading text counts as an empty string
    pieces = [node.text or '']
    for child in node:
        pieces.append(etree.tostring(child))
    return ''.join(pieces)
37
def set_inner_xml(node, text):
    """ replaces the text and children of a node with a parsed XML fragment

    node: element whose content is overwritten (tag, attributes and tail stay)
    text: XML fragment; it must be well-formed once wrapped in a single element

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    # the fragment may have several top-level nodes, so parse it inside
    # a throwaway wrapper element and take over the wrapper's content
    wrapper = etree.fromstring('<x>%s</x>' % text)
    node[:] = wrapper[:]
    node.text = wrapper.text
50
51
def node_name(node):
    """ extracts the plain-text name of a node

    Works on a copy, so the node itself is not modified.  The content of
    'pe', 'pa', 'pt', 'pr' and 'motyw' descendants is dropped (the text
    following them is kept); all remaining markup is stripped.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    scratch = deepcopy(node)

    skipped_tags = ('pe', 'pa', 'pt', 'pr', 'motyw')
    for tag in skipped_tags:
        path = './/%s' % tag
        for dropped in scratch.findall(path):
            # clear() also resets the tail, which belongs to the
            # surrounding text - save and restore it
            kept_tail = dropped.tail
            dropped.clear()
            dropped.tail = kept_tail

    # unwrap every remaining element, leaving only text in the root
    etree.strip_tags(scratch, '*')
    return scratch.text
68
69
def xslt(xml, sheet):
    """ applies an XSLT stylesheet to a document and returns the result

    xml: an element or an element tree; a bare element is wrapped in a tree
    sheet: path to the stylesheet file
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as sheet_file:
        stylesheet = etree.parse(sheet_file)
        return document.xslt(stylesheet)
75
76
def replace_characters(node):
    """ applies typographic substitutions to all text in a subtree, in place

    Byte-order marks are removed, '---' and '--' become dashes, ',,'
    becomes a low opening quote, and straight double and single quotes
    become a right double quote and a right single quote.
    'uwaga' and 'extra' elements are emptied (their tail is kept).
    """

    # order matters: '---' has to be handled before '--'
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for plain, fancy in substitutions:
            text = text.replace(plain, fancy)
        return text

    if node.tag in ('uwaga', 'extra'):
        # clear() wipes the tail as well, so put it back afterwards
        kept_tail = node.tail
        node.clear()
        node.tail = kept_tail

    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
95
96
def find_annotations(annotations, source, part_no):
    """ collects annotations from a subtree, leaving numbered markers behind

    annotations: element that receives a copy of every annotation found
    source: element whose descendants are searched (and modified in place)
    part_no: number of the chunk being processed, stored on each copy

    Every 'pe', 'pa', 'pt' or 'pr' element is copied into `annotations`
    with 'number' and 'part' attributes set; the original is emptied and
    its text replaced with that number.  Content of 'extra' and 'uwaga'
    elements is not searched.
    """
    annotation_tags = ('pe', 'pa', 'pt', 'pr')
    for child in source:
        if child.tag in annotation_tags:
            # annotations are numbered in order of discovery, from 1
            number = str(len(annotations) + 1)

            note = deepcopy(child)
            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)

            # reduce the original to a bare marker; clear() drops the
            # tail too, so it is restored
            kept_tail = child.tail
            child.clear()
            child.tail = kept_tail
            child.text = number

        if child.tag in ('extra', 'uwaga'):
            continue
        find_annotations(annotations, child, part_no)
112
113
def replace_by_verse(tree):
    """ splits stanzas into separate verses at every '/' line break

    For each stanza, the content of its 'slowo_obce' and 'wyroznienie'
    children is processed first, then the stanza itself.  Content is
    split on '/' followed by a newline; each resulting piece is wrapped
    in a 'wers_normalny' element unless it already starts with markup
    that is passed through unchanged.  Content without such a line
    break is left alone.
    """

    def versify(element, verbatim_prefixes):
        # rewrite the element only if it actually contains a line break
        pieces = inner_xml(element).split('/\n')
        if len(pieces) <= 1:
            return
        rebuilt = ''.join(
            piece if piece.startswith(verbatim_prefixes)
            else '<wers_normalny>' + piece + '</wers_normalny>'
            for piece in pieces)
        set_inner_xml(element, rebuilt)

    for stanza in tree.findall('.//' + WLNS('strofa')):
        for inline in stanza:
            if inline.tag in ('slowo_obce', 'wyroznienie'):
                versify(inline, ('<wers',))
        versify(stanza, ('<wers', '<extra'))
139
140
def add_to_manifest(manifest, partno):
    """ appends an item for chunk number `partno` to the manifest of content.opf

    The item gets the id 'part<N>' and points to 'part<N>.html'.
    """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
151
152
def add_to_spine(spine, partno):
    """ appends an itemref for chunk number `partno` to the spine of content.opf

    The reference points at the manifest item with the id 'part<N>'.
    """

    reference = {'idref': 'part%d' % partno}
    spine.append(spine.makeelement(OPFNS('itemref'), attrib=reference))
158
159
class TOC(object):
    """ a node of the table-of-contents tree

    name: label of the entry
    part_number: number of the chunk file ('part<N>.html') the entry points to
    sub_number: number of the '#sub<N>' anchor inside that file, or None
        when the entry points at the file as a whole
    children: nested entries, in reading order
    """

    def __init__(self, name=None, part_number=None):
        self.name = name
        self.part_number = part_number
        self.sub_number = None
        self.children = []

    def add(self, name, part_number, level=0, is_part=True):
        """ adds a new entry `level` steps below this node

        Descends through the last child at each step; if a node on the
        way has no children, the entry is added right there instead.
        For is_part=False the entry gets a sub_number (number of
        siblings including itself, plus one), which is returned;
        otherwise None is returned.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level-1, is_part)

        entry = TOC(name, part_number)
        self.children.append(entry)
        if is_part:
            return None
        entry.sub_number = len(self.children) + 1
        return entry.sub_number

    def append(self, toc):
        """ adds a whole TOC node as the last child """
        self.children.append(toc)

    def extend(self, toc):
        """ adds all children of another TOC node as children of this one """
        self.children.extend(toc.children)

    def depth(self):
        """ returns the number of levels below this node (0 for a leaf) """
        if not self.children:
            return 0
        return 1 + max(child.depth() for child in self.children)

    def write_to_xml(self, nav_map, counter):
        """ writes the children as nested navPoint elements under nav_map

        counter: number used for the first navPoint (id and playOrder)
        Returns the first number not used by any written navPoint.
        """
        for entry in self.children:
            target = 'part%d.html' % entry.part_number
            if entry.sub_number is not None:
                target += '#sub%d' % entry.sub_number

            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            label_text = nav_map.makeelement(NCXNS('text'))
            label_text.text = entry.name
            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            nav_label.append(label_text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', target)
            nav_point.append(content)

            nav_map.append(nav_point)
            # nested entries continue the same numbering sequence
            counter = entry.write_to_xml(nav_point, counter + 1)
        return counter
211
212
def used_chars(element):
    """ returns the set of characters used in the text of an element's subtree

    Covers the text and tail of the element itself and of all its
    descendants.
    """
    chars = set()
    pending = [element]
    while pending:
        current = pending.pop()
        chars.update((current.text or '') + (current.tail or ''))
        # queue the children of the current element
        pending.extend(current)
    return chars
219
220
def chop(main_text):
    """ divides the main content of the XML file into chunks

    Generator.  A new chunk starts at every 'naglowek_czesc' and at every
    'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul' which does not
    directly follow a 'naglowek_czesc'.

    The same <utwor><master>...</master></utwor> container is yielded
    every time and refilled (with copies of the source nodes) between
    yields, so each chunk has to be processed before asking for the
    next one.  The first chunk is empty if the content starts with one
    of the headers above.
    """

    section_headers = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    # one container, reused for every chunk
    container = etree.Element('utwor')
    etree.SubElement(container, 'master')
    master = container[0]

    after_part_header = False
    for node in main_text:
        tag = node.tag
        starts_part = tag == 'naglowek_czesc'
        if starts_part or (not after_part_header and tag in section_headers):
            # hand out what has been gathered, then start over with this header
            yield container
            master[:] = [deepcopy(node)]
        else:
            master.append(deepcopy(node))
        # a header directly after a part header stays in the part's chunk
        after_part_header = starts_part
    yield container
243
244
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk container as produced by chop(); its first child
        holds the content
    chunk_no: number of the chunk, used for TOC entries and annotations
    annotations: element collecting the annotations found in the chunk
    empty: if true, the content is not transformed and the static
        empty-chunk page is returned instead (the TOC is still built)
    _empty_html_static: internal - the mutable default is deliberate, it
        caches the empty-chunk page between calls; do not pass it
    """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            # stored on the element so the TOC anchor number is available
            # to the transformation of this chunk
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # read the page once and close the file right away
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
267
268
def _embed_fonts(epub_zip, manifest, chars, verbose=False):
    """ subsets the DejaVu Serif fonts to `chars` and adds them to the EPUB """

    optimizer_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer')
    used = ''.join(chars).encode('utf-8')
    tmpdir = mkdtemp('-librarian-epub')
    try:
        for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
            subset_path = os.path.join(tmpdir, fname)
            optimizer_call = ['perl', 'subset.pl', '--chars', used,
                              get_resource('fonts/' + fname), subset_path]
            # the script is run from its own directory via cwd=, so the
            # working directory of this process is never changed
            if verbose:
                print("Running font-optimizer")
                subprocess.check_call(optimizer_call, cwd=optimizer_dir)
            else:
                # discard the output; unread PIPEs could block the child
                # once the pipe buffer fills up
                with open(os.devnull, 'w') as devnull:
                    subprocess.check_call(optimizer_call, cwd=optimizer_dir,
                                          stdout=devnull, stderr=devnull)
            epub_zip.write(subset_path, os.path.join('OPS', fname))
            manifest.append(etree.fromstring(
                '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
    finally:
        # remove the temporary files even if the optimizer failed
        rmtree(tmpdir)


def _write_epub(provider, input_xml, book_info, metadata, output_file, verbose, style, sample, cover, flags):
    """ writes all members of the EPUB archive into output_file """

    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children """

        replace_characters(input_xml.getroot())

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
        elif children:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                with open(get_resource('epub/emptyChunk.html')) as empty_file:
                    html_string = empty_file.read()
            else:
                html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            epub_zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        else:
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    # once the sample budget is used up, chunks are
                    # replaced with the empty page
                    if sample <= 0:
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        if children:
            for child in children:
                child_xml = etree.parse(provider.by_uri(child))
                child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
                toc.append(child_toc)
                chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # the mimetype entry is written first and stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    epub_zip.writestr(mime, 'application/epub+zip')
    epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                       'media-type="application/oebps-package+xml" />' \
                       '</rootfiles></container>')
    epub_zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
    epub_zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    epub_zip.write(style, os.path.join('OPS', 'style.css'))

    opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    spine = opf.find('.//' + OPFNS('spine'))

    if cover:
        cover_file = StringIO()
        c = cover(book_info.author.readable(), book_info.title)
        c.save(cover_file)
        c_name = 'cover.%s' % c.ext()
        epub_zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
        epub_zip.writestr('OPS/cover.html', etree.tostring(
                        cover_tree, method="html", pretty_print=True))

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))

    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                               '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                               '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                               '</navPoint></navMap></ncx>')
    nav_map = toc_file[-1]

    toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)

    if not toc.children:
        toc.add(u"Początek utworu", 1)
    # NavPoint-1 is the title page, so the book's entries start at 2
    toc_counter = toc.write_to_xml(nav_map, 2)

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        nav_map.append(etree.fromstring(
            '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
            '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
        toc_counter += 1
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        epub_zip.writestr('OPS/annotations.html', etree.tostring(
                            html_tree, method="html", pretty_print=True))

    nav_map.append(etree.fromstring(
        '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
        '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    epub_zip.writestr('OPS/last.html', etree.tostring(
                        html_tree, method="html", pretty_print=True))

    if not flags or 'without-fonts' not in flags:
        # strip fonts
        _embed_fonts(epub_zip, manifest, chars, verbose)

    epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    # fill in the real values of dtb:uid and dtb:depth
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
    epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    epub_zip.close()


def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
              style=None,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    provider: a DocProvider
    slug: slug of file to process, available by provider
    file_path: path of the file to process; either this or slug must be given, not both
    output_file: file-like object or path to output file
    output_dir: path to directory to save output file to; either this or output_file must be present
    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
    verbose: show the output of the font optimizer instead of discarding it
    style: path to a CSS file to use instead of the default stylesheet
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover object
    flags: less-advertising, without-fonts

    Raises ValueError if the input or the output is not specified
    properly, and NoDublinCore if the document has no metadata.
    """

    # read metadata from the first file
    if file_path:
        if slug:
            raise ValueError('slug or file_path should be specified, not both')
        # binary mode: the XML parser works out the encoding itself
        with open(file_path, 'rb') as f:
            input_xml = etree.parse(f)
    else:
        if not slug:
            raise ValueError('either slug or file_path should be specified')
        input_xml = etree.parse(provider[slug])

    if flags:
        for flag in flags:
            input_xml.getroot().set(flag, 'yes')

    metadata = input_xml.find('.//'+RDFNS('Description'))
    if metadata is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    opened_here = False
    if output_dir is not None:
        if make_dir:
            author = unicode(book_info.author)
            output_dir = os.path.join(output_dir, author)
            try:
                os.makedirs(output_dir)
            except OSError:
                # most likely the directory exists already; any other
                # problem surfaces when the file is opened below
                pass
        if slug:
            file_name = '%s.epub' % slug
        else:
            file_name = os.path.splitext(os.path.basename(file_path))[0] + '.epub'
        # a ZIP archive is binary data, so the file must not be opened in text mode
        output_file = open(os.path.join(output_dir, file_name), 'wb')
        opened_here = True

    if output_file is None:
        raise ValueError('either output_file or output_dir should be specified')

    try:
        _write_epub(provider, input_xml, book_info, metadata, output_file,
                    verbose, style, sample, cover, flags)
    finally:
        # only close what was opened here; a caller's file object is left open
        if opened_here:
            output_file.close()