ad84ab034cefceff0a3ccc399309c2f92acefe3f
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.  
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import subprocess
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
14 import zipfile
15 from tempfile import mkdtemp
16 from shutil import rmtree
17
18 import sys
19
20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
21 from librarian.dcparser import BookInfo
22
23 from librarian import functions, get_resource
24
# Module-level side effect: runs once, when this module is first imported.
# NOTE(review): presumably registers a `person_name` extension function that
# the EPUB XSLT stylesheets rely on -- confirm in librarian.functions.
functions.reg_person_name()
26
27
def inner_xml(node):
    """ serializes everything inside a node: its leading text and all children

    The node's own tag is not included; each child is serialized
    with etree.tostring().

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    # a missing leading text counts as an empty string
    pieces = [node.text or '']
    for child in node:
        pieces.append(etree.tostring(child))
    return ''.join(pieces)
37
def set_inner_xml(node, text):
    """ replaces node's text and children with the parsed content of a string

    The string is wrapped in a temporary <x> element and parsed, so it
    has to be a well-formed XML fragment.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    wrapped = '<x>%s</x>' % text
    holder = etree.fromstring(wrapped)
    node.text = holder.text
    # slice assignment swaps all existing children for the parsed ones
    node[:] = list(holder)
50
51
def node_name(node):
    """ returns the plain-text name of a node

    Works on a copy, so the node itself is left untouched. The content of
    annotations ('pe', 'pa', 'pt', 'pr') and of 'motyw' elements is dropped,
    all remaining markup is stripped and the resulting text is returned.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    scratch = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for unwanted in scratch.findall('.//' + tag):
            # clear() would also wipe the text following the element,
            # so the tail is put back afterwards
            following_text = unwanted.tail
            unwanted.clear()
            unwanted.tail = following_text

    etree.strip_tags(scratch, '*')
    return scratch.text
68
69
def xslt(xml, sheet):
    """ applies an XSLT stylesheet to a document and returns the result

    xml: an element (it gets wrapped in an ElementTree first) or an object
        providing an xslt() method
    sheet: path to the stylesheet file
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
75
76
def replace_characters(node):
    """ replaces ASCII punctuation with typographic characters, in place

    Walks the node and all its descendants, rewriting text and tail of
    every element:

        U+FEFF (byte order mark) -> removed
        ---  -> U+2014 (em dash)
        --   -> U+2013 (en dash)
        ,,   -> U+201E (low double quotation mark)
        "    -> U+201D (right double quotation mark)
        '    -> U+2019 (right single quotation mark)

    The replacements are applied in this order, so '---' is handled before
    '--' gets a chance to match its first two characters.

    An 'extra' element is emptied (attributes, text and children are
    removed), but the text following it is kept and processed like any
    other text.
    """
    def replace_chars(text):
        if text is None:
            return None
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    if node.tag == 'extra':
        # clear() resets the tail as well; the tail is regular document text
        # belonging to the parent, so it has to survive
        tail = node.tail
        node.clear()
        node.tail = replace_chars(tail)
    else:
        node.text = replace_chars(node.text)
        node.tail = replace_chars(node.tail)
        for child in node:
            replace_characters(child)
94
95
def find_annotations(annotations, source, part_no):
    """ moves annotations out of the text, leaving numbered markers behind

    annotations: element collecting copies of all annotations found so far
    source: element whose descendants are searched
    part_no: number of the chunk being processed

    Every 'pe', 'pa', 'pt' or 'pr' element is copied into annotations with
    'number' (its 1-based position in annotations) and 'part' attributes.
    The original element is emptied and gets the number as its only text;
    the text following it is preserved. Content of 'extra' elements is not
    searched.
    """
    annotation_tags = ('pe', 'pa', 'pt', 'pr')

    for element in source:
        if element.tag in annotation_tags:
            number = str(len(annotations) + 1)

            note = deepcopy(element)
            note.tail = ''
            note.set('number', number)
            note.set('part', str(part_no))
            annotations.append(note)

            # clear() wipes the tail too, so it is restored afterwards
            following_text = element.tail
            element.clear()
            element.text = number
            element.tail = following_text

        if element.tag == 'extra':
            continue
        find_annotations(annotations, element, part_no)
111
112
def replace_by_verse(tree):
    """ splits stanzas into verse elements at every '/' ending a line

    For each 'strofa' element the inner XML is cut at '/' followed by
    a newline. Pieces that already start with a verse tag ('<wers...') or,
    at stanza level, with '<extra', are kept as they are; every other piece
    is wrapped in <wers_normalny>. 'slowo_obce' and 'wyroznienie' elements
    placed directly in a stanza are processed the same way first.
    Content without such a separator is left untouched.
    """

    def as_verses(pieces, untouched_prefixes):
        # wrap every piece which is not already a verse of its own
        return ''.join(
            piece if piece.startswith(untouched_prefixes)
            else ''.join(('<wers_normalny>', piece, '</wers_normalny>'))
            for piece in pieces)

    for stanza in tree.findall('.//' + WLNS('strofa')):
        for inline in stanza:
            if inline.tag not in ('slowo_obce', 'wyroznienie'):
                continue
            pieces = inner_xml(inline).split('/\n')
            if len(pieces) > 1:
                set_inner_xml(inline, as_verses(pieces, ('<wers',)))

        pieces = inner_xml(stanza).split('/\n')
        if len(pieces) > 1:
            set_inner_xml(stanza, as_verses(pieces, ('<wers', '<extra')))
138
139
def add_to_manifest(manifest, partno):
    """ registers part<partno>.html as an XHTML item in the manifest of content.opf """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    item = manifest.makeelement(OPFNS('item'), attrib=attributes)
    manifest.append(item)
150
151
def add_to_spine(spine, partno):
    """ appends a reference to part<partno> at the end of the spine of content.opf """

    attributes = {'idref': 'part%d' % partno}
    itemref = spine.makeelement(OPFNS('itemref'), attrib=attributes)
    spine.append(itemref)
157
158
159 class TOC(object):
160     def __init__(self, name=None, part_number=None):
161         self.children = []
162         self.name = name
163         self.part_number = part_number
164         self.sub_number = None
165
166     def add(self, name, part_number, level=0, is_part=True):
167         if level > 0 and self.children:
168             return self.children[-1].add(name, part_number, level-1, is_part)
169         else:
170             t = TOC(name)
171             t.part_number = part_number
172             self.children.append(t)
173             if not is_part:
174                 t.sub_number = len(self.children) + 1
175                 return t.sub_number
176
177     def append(self, toc):
178         self.children.append(toc)
179
180     def extend(self, toc):
181         self.children.extend(toc.children)
182
183     def depth(self):
184         if self.children:
185             return max((c.depth() for c in self.children)) + 1
186         else:
187             return 0
188
189     def write_to_xml(self, nav_map, counter):
190         for child in self.children:
191             nav_point = nav_map.makeelement(NCXNS('navPoint'))
192             nav_point.set('id', 'NavPoint-%d' % counter)
193             nav_point.set('playOrder', str(counter))
194
195             nav_label = nav_map.makeelement(NCXNS('navLabel'))
196             text = nav_map.makeelement(NCXNS('text'))
197             text.text = child.name
198             nav_label.append(text)
199             nav_point.append(nav_label)
200
201             content = nav_map.makeelement(NCXNS('content'))
202             src = 'part%d.html' % child.part_number
203             if child.sub_number is not None:
204                 src += '#sub%d' % child.sub_number
205             content.set('src', src)
206             nav_point.append(content)
207             nav_map.append(nav_point)
208             counter = child.write_to_xml(nav_point, counter + 1)
209         return counter
210
211
def used_chars(element):
    """ returns the set of all characters found in text and tail
    of an element and of all its descendants """
    own_text = (element.text or '') + (element.tail or '')
    found = set(own_text)
    for child in element:
        found.update(used_chars(child))
    return found
218
219
def chop(main_text):
    """ divides main content of the XML file into chunks

    Generator. A new chunk starts at every 'naglowek_czesc' and at every
    'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul', unless that header
    directly follows a 'naglowek_czesc' -- then it stays in the same chunk.

    Each chunk is an <utwor><master>...</master></utwor> element holding
    copies of the source nodes. The very same element is yielded every time
    and refilled on the next step, so a chunk has to be consumed before
    the following one is requested. If the text begins with a header,
    the first chunk is empty.
    """

    # one container, reused for all chunks
    chunk = etree.Element('utwor')
    master = etree.SubElement(chunk, 'master')

    chapter_tags = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")
    right_after_part_header = False

    for node in main_text:
        tag = node.tag
        starts_part = tag == 'naglowek_czesc'
        starts_chapter = not right_after_part_header and tag in chapter_tags

        if starts_part or starts_chapter:
            # hand over what has been collected, then start anew
            yield chunk
            master[:] = [deepcopy(node)]
            if starts_part:
                right_after_part_header = True
        else:
            master.append(deepcopy(node))
            right_after_part_header = False

    yield chunk
242
243
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk element; its first child holds the content
    chunk_no: number of the chunk, used for TOC entries and annotations
    annotations: element collecting annotations found in the text
    empty: if true, the content is not transformed -- a static placeholder
        page is returned instead, together with an empty set of characters
    _empty_html_static: private cache, do not pass. The mutable default is
        intentional: it keeps the placeholder page between calls, so the
        file is read only once.

    TOC entries are collected in both cases. Sub-entries ('naglowek_podrozdzial',
    'naglowek_scena') additionally get their number stored in the 'sub'
    attribute of the header element.
    """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # read through a context manager, so the file does not stay open
            with open(get_resource('epub/emptyChunk.html')) as empty_chunk_file:
                _empty_html_static.append(empty_chunk_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
266
267
def _read_resource(path):
    """ returns the content of a packaged resource file, closing the file afterwards """
    with open(get_resource(path)) as resource_file:
        return resource_file.read()


def _add_subset_fonts(archive, chars, verbose):
    """ writes DejaVu Serif fonts, reduced to the given characters, into OPS/ of the archive

    Runs the perl script subset.pl from the 'font-optimizer' directory next
    to this module, once for each font variant. Raises
    subprocess.CalledProcessError if the script fails. The temporary
    directory holding the reduced fonts is removed in any case.
    """
    optimizer_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer')
    char_list = ''.join(chars).encode('utf-8')
    tmpdir = mkdtemp('-librarian-epub')
    try:
        for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
            optimizer_call = ['perl', 'subset.pl', '--chars', char_list,
                              get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
            # the script is run with cwd= rather than os.chdir(), so the
            # working directory of this process is never changed
            if verbose:
                print("Running font-optimizer")
                subprocess.check_call(optimizer_call, cwd=optimizer_dir)
            else:
                # output is discarded; pipes must not be used here, since
                # nobody reads them and the script would block once they fill up
                with open(os.devnull, 'w') as devnull:
                    subprocess.check_call(optimizer_call, cwd=optimizer_dir,
                                          stdout=devnull, stderr=devnull)
            archive.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
    finally:
        rmtree(tmpdir)


def _write_epub(provider, input_xml, metadata, book_info, output_file, verbose, sample, cover_fn):
    """ writes the whole EPUB archive to output_file

    provider: a DocProvider, used for fetching children of the document
    input_xml: parsed main document
    metadata: ElementTree built around the document's RDF Description element
    book_info: BookInfo of the main document
    output_file: file-like object or path the archive is written to
    verbose, sample, cover_fn: as in transform()
    """

    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns a tuple: TOC of the file, first unused chunk number,
        set of characters used, remaining sample counter.
        """

        replace_characters(input_xml.getroot())

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            archive.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
        elif children:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                html_string = _read_resource('epub/emptyChunk.html')
            else:
                html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            archive.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        else:
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        # sample is complete, remaining chunks are left empty
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                archive.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        if children:
            for child in children:
                child_xml = etree.parse(provider.by_uri(child))
                child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
                toc.append(child_toc)
                chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    archive = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # mimetype is written first and stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    archive.writestr(mime, 'application/epub+zip')
    archive.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                       'media-type="application/oebps-package+xml" />' \
                       '</rootfiles></container>')
    archive.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
    archive.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))

    opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    spine = opf.find('.//' + OPFNS('spine'))

    if cover_fn:
        cover = StringIO()
        cover_fn(book_info.author.readable(), book_info.title).save(cover, format='JPEG')
        archive.writestr(os.path.join('OPS', 'cover.jpg'), cover.getvalue())
        del cover
        archive.writestr('OPS/cover.html', _read_resource('epub/cover.html'))
        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="cover.jpg" media-type="image/jpeg" />'))
        spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))

    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                               '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                               '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                               '</navPoint></navMap></ncx>')
    nav_map = toc_file[-1]

    toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)

    if not toc.children:
        toc.add(u"Początek utworu", 1)
    # NavPoint-1 is the title page, so numbering of the rest starts at 2
    toc_counter = toc.write_to_xml(nav_map, 2)

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        nav_map.append(etree.fromstring(
            '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
            '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
        toc_counter += 1
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        archive.writestr('OPS/annotations.html', etree.tostring(
                            html_tree, method="html", pretty_print=True))

    nav_map.append(etree.fromstring(
        '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
        '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    archive.writestr('OPS/last.html', etree.tostring(
                        html_tree, method="html", pretty_print=True))

    # strip fonts
    _add_subset_fonts(archive, chars, verbose)

    archive.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    # the title is set as element text instead of being pasted into an XML
    # string, so characters like '&' or '<' in it are escaped properly
    doc_title_text = toc_file.makeelement(NCXNS('text'))
    doc_title_text.text = title
    toc_file[1].append(doc_title_text)
    archive.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    archive.close()


def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, sample=None, cover_fn=None):
    """ produces a EPUB file

    provider: a DocProvider
    slug: slug of file to process, available by provider
    file_path: path to the file to process; either this or slug must be present, but not both
    output_file: file-like object or path to output file
    output_dir: path to directory to save output file to; either this or output_file must be present
    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
    verbose: if true, output of the font optimizer is shown instead of being discarded
    sample=n: generate sample e-book (with at least n paragraphs)
    cover_fn: function(author, title) -> cover image

    Raises ValueError if the input or the output is not specified properly
    and NoDublinCore if the document has no RDF Description element.
    A file created in output_dir is closed before returning; an object given
    as output_file is left open for the caller.
    """

    # read metadata from the first file
    if file_path:
        if slug:
            raise ValueError('slug or file_path should be specified, not both')
        with open(file_path, 'r') as f:
            input_xml = etree.parse(f)
    else:
        if not slug:
            raise ValueError('either slug or file_path should be specified')
        input_xml = etree.parse(provider[slug])

    metadata = input_xml.find('.//'+RDFNS('Description'))
    if metadata is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    own_output = None
    if output_dir is not None:
        if make_dir:
            author = unicode(book_info.author)
            output_dir = os.path.join(output_dir, author)
            try:
                os.makedirs(output_dir)
            except OSError:
                # an already existing directory is fine, anything else is not
                if not os.path.isdir(output_dir):
                    raise
        if slug:
            epub_name = '%s.epub' % slug
        else:
            epub_name = os.path.splitext(os.path.basename(file_path))[0] + '.epub'
        # EPUB is a ZIP archive, so the file has to be opened in binary mode
        own_output = output_file = open(os.path.join(output_dir, epub_name), 'wb')
    elif output_file is None:
        raise ValueError('either output_file or output_dir should be specified')

    try:
        _write_epub(provider, input_xml, metadata, book_info, output_file, verbose, sample, cover_fn)
    finally:
        # only a file opened here is closed here
        if own_output is not None:
            own_output.close()