#1032: epubs for virtualo
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.  
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import subprocess
11 from copy import deepcopy
12 from lxml import etree
13 import zipfile
14 from tempfile import mkdtemp
15 from shutil import rmtree
16
17 import sys
18
19 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
20 from librarian.dcparser import BookInfo
21
22 from librarian import functions
23
24 functions.reg_person_name()
25
26
def inner_xml(node):
    """ returns node's text and children as a string

    The text following each child (its tail) is serialized together
    with that child, so it is included in the result.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    pieces = [node.text or '']
    pieces.extend(etree.tostring(child) for child in node)
    return ''.join(pieces)
36
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    `text` has to be a well-formed XML fragment, as it is parsed
    inside a temporary wrapper element.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    # parse the fragment inside a throw-away wrapper element
    wrapper = etree.fromstring('<x>%s</x>' % text)
    # move the wrapper's content into the target node
    node.text = wrapper.text
    node[:] = list(wrapper)
49
50
def node_name(node):
    """ Find out a node's name

    Works on a copy of the node: annotations ('pe', 'pa', 'pt', 'pr')
    and 'motyw' elements are emptied, all remaining markup is stripped
    and the resulting plain text is returned.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    clone = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for found in clone.findall('.//%s' % tag):
            # drop the element's content, but keep the text following it
            kept_tail = found.tail
            found.clear()
            found.tail = kept_tail
    etree.strip_tags(clone, '*')
    return clone.text
67
68
def xslt(xml, sheet):
    """ applies the XSLT stylesheet stored in file `sheet` to `xml`

    `xml` may be an element or an element tree; an element is wrapped
    in a tree first. Returns the result of the transformation.
    """
    tree = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as sheet_file:
        stylesheet = etree.parse(sheet_file)
        return tree.xslt(stylesheet)
74
75
# directory with static EPUB resources, located next to this module
_resdir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    'epub',
)


def res(fname):
    """ returns the path of file `fname` inside the resource directory """
    return os.path.join(_resdir, fname)
79
80
def replace_characters(node):
    """ replaces ASCII punctuation with typographic characters, in place

    Processes the text and tail of `node` and of all its descendants.
    An 'extra' element is cleared instead and not descended into.
    """
    # applied in this order: '---' has to be handled before '--'
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    if node.tag == 'extra':
        node.clear()
        return

    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
98
99
def find_annotations(annotations, source, part_no):
    """ moves annotations found under `source` into `annotations`

    A copy of every 'pe', 'pa', 'pt' or 'pr' element is appended to
    `annotations`, with its consecutive number and `part_no` stored in
    the 'number' and 'part' attributes. The original element is left
    in place, reduced to that number as its only content.
    Content of 'extra' and 'podtytul' elements is not searched.
    """
    annotation_tags = ('pe', 'pa', 'pt', 'pr')
    skipped_tags = ('extra', 'podtytul')

    for node in source:
        if node.tag in annotation_tags:
            number = str(len(annotations) + 1)

            note = deepcopy(node)
            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)

            # leave just the reference number where the annotation was,
            # preserving the text that follows it
            kept_tail = node.tail
            node.clear()
            node.tail = kept_tail
            node.text = number
        if node.tag in skipped_tags:
            continue
        find_annotations(annotations, node, part_no)
115
116
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    The content of every stanza, and of the 'slowo_obce' and
    'wyroznienie' elements directly inside it, is split on a slash
    followed by a newline. Each resulting fragment is wrapped in
    'wers_normalny', unless it already starts with a verse tag
    (or, on the stanza level, with an 'extra' tag).
    """

    def wrap_verses(fragments, untouched_prefixes):
        # fragments starting with one of the prefixes are kept verbatim
        return ''.join(
            fragment if fragment.startswith(untouched_prefixes)
            else ''.join(('<wers_normalny>', fragment, '</wers_normalny>'))
            for fragment in fragments)

    for stanza in tree.findall('.//' + WLNS('strofa')):
        # verses broken inside inline elements go first
        for inline in stanza:
            if inline.tag not in ('slowo_obce', 'wyroznienie'):
                continue
            fragments = inner_xml(inline).split('/\n')
            if len(fragments) > 1:
                set_inner_xml(inline, wrap_verses(fragments, '<wers'))

        # then the stanza itself
        fragments = inner_xml(stanza).split('/\n')
        if len(fragments) > 1:
            set_inner_xml(stanza, wrap_verses(fragments, ('<wers', '<extra')))
142
143
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The new item has the id 'part<partno>' and points to the file
    'part<partno>.html'.
    """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
154
155
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The new itemref refers to the manifest item with id 'part<partno>'.
    """

    attributes = {'idref': 'part%d' % partno}
    spine.append(spine.makeelement(OPFNS('itemref'), attrib=attributes))
161
162
class TOC(object):
    """ A node of the table of contents tree

    name: label of the entry
    part_number: number of the part<n>.html file the entry points to
    sub_number: number of the 'sub<n>' anchor inside that file,
        None if the entry points to the file as a whole
    children: list of subordinate TOC nodes
    """

    def __init__(self, name=None, part_number=None):
        self.name = name
        self.part_number = part_number
        self.sub_number = None
        self.children = []

    def add(self, name, part_number, level=0, is_part=True):
        """ adds an entry `level` levels below this node

        The entry is attached under the most recently added child on
        each level; when there are no children to descend into,
        it is added directly to this node.

        Returns the sub_number given to the entry if it is not
        a part (is_part=False), None otherwise.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level-1, is_part)

        entry = TOC(name, part_number)
        self.children.append(entry)
        if is_part:
            return None
        entry.sub_number = len(self.children) + 1
        return entry.sub_number

    def append(self, toc):
        """ adds `toc` as the last child of this node """
        self.children.append(toc)

    def extend(self, toc):
        """ adds all children of `toc` to the children of this node """
        self.children.extend(toc.children)

    def depth(self):
        """ returns the number of levels below this node """
        if not self.children:
            return 0
        return 1 + max(child.depth() for child in self.children)

    def write_to_xml(self, nav_map, counter):
        """ writes children of this node into `nav_map` as navPoints

        Entries are numbered consecutively, starting with `counter`;
        descendants are nested in the navPoint of their parent.
        Returns the first number left unused.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            label_text = nav_map.makeelement(NCXNS('text'))
            label_text.text = child.name
            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            nav_label.append(label_text)
            nav_point.append(nav_label)

            target = 'part%d.html' % child.part_number
            if child.sub_number is not None:
                target += '#sub%d' % child.sub_number
            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', target)
            nav_point.append(content)

            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter
214
215
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set of characters found in the text and tail
    of the element and of all its descendants.
    """
    found = set(element.text or '')
    found.update(element.tail or '')
    for child in element:
        found |= used_chars(child)
    return found
222
223
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generates 'utwor' elements with a single 'master' child, filled
    with copies of consecutive children of `main_text`. A new chunk
    starts at every 'naglowek_czesc', and at every chapter-level
    header, unless that header directly follows a 'naglowek_czesc'.

    The very same element is yielded every time, only with different
    content, so each chunk has to be used up before asking for
    the next one. The first chunk is yielded even if it's empty.
    """
    chapter_headers = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    # prepare a container for each chunk
    chunk = etree.Element('utwor')
    master = etree.SubElement(chunk, 'master')

    after_part_header = False
    for node in main_text:
        tag = node.tag
        if tag == 'naglowek_czesc':
            yield chunk
            after_part_header = True
            master[:] = [deepcopy(node)]
        elif tag in chapter_headers and not after_part_header:
            yield chunk
            master[:] = [deepcopy(node)]
        else:
            master.append(deepcopy(node))
            after_part_header = False
    yield chunk
246
247
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk element, with the content in its first child
    chunk_no: number of the chunk, used for TOC entries and annotations
    annotations: element collecting annotations found in the chunk
    empty: if true, the content is replaced with an empty page;
        TOC entries are still created
    _empty_html_static: internal cache for the empty page, shared
        between calls on purpose; not meant to be passed by callers
    """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            # mark the element with the number used in its TOC entry
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # read the empty page once, making sure the file gets closed
            with open(res('emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
270
271
def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, sample=None):
    """ produces a EPUB file

    provider: a DocProvider
    slug: slug of file to process, available by provider
    file_path: path to the XML file to process; either this or slug must be given, but not both
    output_file: file-like object or path to output file
    output_dir: path to directory to save output file to; either this or output_file must be present
    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
    verbose: don't hide the output of font-optimizer
    sample=n: generate sample e-book (with at least n paragraphs)

    Raises ValueError if both or none of slug and file_path are given,
    and NoDublinCore if the document has no RDF Description element.
    """

    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children """

        replace_characters(input_xml.getroot())

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(input_xml, res('xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            zipf.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
        elif children:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                with open(res('emptyChunk.html')) as empty_file:
                    html_string = empty_file.read()
            else:
                html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            zipf.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        else:
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        # sample is already long enough
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zipf.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        if children:
            for child in children:
                child_xml = etree.parse(provider.by_uri(child))
                child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
                toc.append(child_toc)
                chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    # read metadata from the first file
    if file_path:
        if slug:
            raise ValueError('slug or file_path should be specified, not both')
        # the XML parser should get raw bytes
        with open(file_path, 'rb') as f:
            input_xml = etree.parse(f)
    else:
        if not slug:
            raise ValueError('either slug or file_path should be specified')
        input_xml = etree.parse(provider[slug])

    metadata = input_xml.find('.//'+RDFNS('Description'))
    if metadata is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    own_output = False
    if output_dir is not None:
        if make_dir:
            author = unicode(book_info.author)
            output_dir = os.path.join(output_dir, author)
            try:
                os.makedirs(output_dir)
            except OSError:
                # typically the directory exists already; if it's missing
                # for any other reason, opening the output file will fail
                pass
        if slug:
            output_name = '%s.epub' % slug
        else:
            output_name = os.path.splitext(os.path.basename(file_path))[0] + '.epub'
        # a ZIP archive is binary data, text mode could corrupt it
        output_file = open(os.path.join(output_dir, output_name), 'wb')
        own_output = True

    zipf = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
    try:
        # write static elements
        mime = zipfile.ZipInfo()
        mime.filename = 'mimetype'
        mime.compress_type = zipfile.ZIP_STORED
        mime.extra = ''
        zipf.writestr(mime, 'application/epub+zip')
        zipf.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                           'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                           '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                           'media-type="application/oebps-package+xml" />' \
                           '</rootfiles></container>')
        for fname in 'style.css', 'logo_wolnelektury.png':
            zipf.write(res(fname), os.path.join('OPS', fname))

        opf = xslt(metadata, res('xsltContent.xsl'))
        manifest = opf.find('.//' + OPFNS('manifest'))
        spine = opf.find('.//' + OPFNS('spine'))

        annotations = etree.Element('annotations')

        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                                   '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                                   '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                                   'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                                   '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                                   '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                                   '</navPoint></navMap></ncx>')
        nav_map = toc_file[-1]

        toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)

        if not toc.children:
            toc.add(u"Początek utworu", 1)
        # number 1 is taken by the title page
        toc_counter = toc.write_to_xml(nav_map, 2)

        # Last modifications in container files and EPUB creation
        if len(annotations) > 0:
            nav_map.append(etree.fromstring(
                '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
                '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
            manifest.append(etree.fromstring(
                '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
            spine.append(etree.fromstring(
                '<itemref idref="annotations" />'))
            replace_by_verse(annotations)
            html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
            chars = chars.union(used_chars(html_tree.getroot()))
            zipf.writestr('OPS/annotations.html', etree.tostring(
                                html_tree, method="html", pretty_print=True))

        # strip fonts
        _strip_fonts(zipf, chars, verbose)

        zipf.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
        title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
        attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
        for st in attributes:
            meta = toc_file.makeelement(NCXNS('meta'))
            meta.set('name', st)
            meta.set('content', '0')
            toc_file[0].append(meta)
        toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
        toc_file[0][1].set('content', str(toc.depth()))
        # set the title as element text rather than pasting it into XML
        # source, so that characters like '&' or '<' don't break parsing
        doc_title = toc_file[1]
        doc_title.text = None
        del doc_title[:]
        etree.SubElement(doc_title, NCXNS('text')).text = title
        zipf.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    finally:
        # finish the archive and release the file we opened, also on errors
        try:
            zipf.close()
        finally:
            if own_output:
                output_file.close()


def _strip_fonts(zipf, chars, verbose):
    """ adds fonts to the archive `zipf`, reduced to characters in `chars`

    Runs the subset.pl script from the font-optimizer directory for every
    font file; its output is shown only if `verbose` is true. The working
    directory is restored and the temporary directory removed afterwards,
    even if the script fails.
    """
    tmpdir = mkdtemp('-librarian-epub')
    cwd = os.getcwd()
    try:
        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
        for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
            optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'), res('../fonts/' + fname), os.path.join(tmpdir, fname)]
            if verbose:
                print("Running font-optimizer")
                subprocess.check_call(optimizer_call)
            else:
                # discard the output; pipes which nobody reads could
                # fill up and block the script
                with open(os.devnull, 'w') as devnull:
                    subprocess.check_call(optimizer_call, stdout=devnull, stderr=devnull)
            zipf.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
    finally:
        os.chdir(cwd)
        rmtree(tmpdir)