format change
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import with_statement
7
8 from copy import deepcopy
9 import os
10 import os.path
11 import subprocess
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
15 import zipfile
16 from tempfile import mkdtemp
17 from shutil import rmtree
18
19 import sys
20
21 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
22 from librarian.dcparser import BookInfo
23 from librarian.cover import ImageCover
24
25 from librarian import functions, get_resource
26
27 functions.reg_person_name()
28
29
def inner_xml(node):
    """ serializes the inside of a node: its leading text followed by every child element

    >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
    x<b>y</b>z
    """

    pieces = ['' if node.text is None else node.text]
    for child in node:
        # each child is serialized together with the text that follows it
        pieces.append(etree.tostring(child))
    return ''.join(pieces)
39
def set_inner_xml(node, text):
    """ replaces node's text and children with the content of an XML fragment string

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print(etree.tostring(e))
    <a>x<b>y</b>z</a>
    """

    # the fragment may have several top-level nodes, so parse it inside a throwaway root
    wrapped_fragment = '<x>%s</x>' % text
    holder = etree.fromstring(wrapped_fragment)
    node.text = holder.text
    node[:] = holder[:]
52
53
def node_name(node):
    """ Extract a plain-text name from a node, leaving the node itself untouched

    Contents of 'pe', 'pa', 'pt', 'pr' and 'motyw' descendants are dropped;
    every other tag is stripped, keeping its text.

    >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
    XYZ
    """

    # work on a copy, the caller's tree must not be modified
    scratch = deepcopy(node)

    ignored_tags = ('pe', 'pa', 'pt', 'pr', 'motyw')
    for tag in ignored_tags:
        for found in scratch.findall('.//%s' % tag):
            # empty the element but keep the text that follows it
            following_text = found.tail
            found.clear()
            found.tail = following_text
    etree.strip_tags(scratch, '*')
    return scratch.text
70
71
def xslt(xml, sheet):
    """ applies the XSLT stylesheet read from file `sheet` to `xml` and returns the result

    `xml` may be a single element (it is wrapped in an ElementTree first)
    or anything that already offers an xslt() method.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
77
78
def replace_characters(node):
    """ fixes typography in the text and tail of a node and all its descendants, in place

    Removes byte order marks and swaps ASCII dashes, quotes and apostrophes
    for their typographic counterparts. Contents of 'uwaga' and 'extra'
    elements are discarded altogether.
    """
    # applied one after another, so the order matters ("---" before "--")
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def fix_typography(text):
        if text is None:
            return None
        for plain, typographic in substitutions:
            text = text.replace(plain, typographic)
        return text

    if node.tag in ('uwaga', 'extra'):
        # drop the contents, but keep the text following the element
        following_text = node.tail
        node.clear()
        node.tail = following_text
    node.text = fix_typography(node.text)
    node.tail = fix_typography(node.tail)
    for descendant in node:
        replace_characters(descendant)
97
98
def find_annotations(annotations, source, part_no):
    """ moves footnotes found under `source` into `annotations`, leaving numbered stubs behind

    Every 'pe', 'pa', 'pt' or 'pr' element is copied to `annotations` with
    'number' (1-based position in `annotations`) and 'part' attributes set;
    the original is emptied and keeps only that number as its text.
    Contents of 'extra' and 'uwaga' elements are not searched.
    """
    footnote_tags = ('pe', 'pa', 'pt', 'pr')
    unsearched_tags = ('extra', 'uwaga')

    for element in source:
        if element.tag in footnote_tags:
            ordinal = str(len(annotations) + 1)

            note = deepcopy(element)
            note.set('number', ordinal)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)

            # turn the original into a bare reference, keeping the text after it
            following_text = element.tail
            element.clear()
            element.tail = following_text
            element.text = ordinal
        if element.tag in unsearched_tags:
            continue
        find_annotations(annotations, element, part_no)
114
115
def replace_by_verse(tree):
    """ Find stanzas and turn their '/'-terminated lines into separate verse elements

    Inside every stanza, 'slowo_obce' and 'wyroznienie' children are split
    first, then the stanza itself. A line that is already a verse element
    (or, at stanza level, an 'extra' element) is kept as it is; any other
    line is wrapped in 'wers_normalny'. Content without a line separator
    is left untouched.
    """

    def split_lines(element, kept_prefixes):
        lines = inner_xml(element).split('/\n')
        if len(lines) <= 1:
            return
        rebuilt = []
        for line in lines:
            if line.startswith(kept_prefixes):
                rebuilt.append(line)
            else:
                rebuilt.append(''.join(('<wers_normalny>', line, '</wers_normalny>')))
        set_inner_xml(element, ''.join(rebuilt))

    for stanza in tree.findall('.//' + WLNS('strofa')):
        for inline in stanza:
            if inline.tag in ('slowo_obce', 'wyroznienie'):
                split_lines(inline, ('<wers',))
        split_lines(stanza, ('<wers', '<extra'))
141
142
def add_to_manifest(manifest, partno):
    """ Appends an item entry for chunk number `partno` to the manifest section of content.opf """

    part_id = 'part%d' % partno
    item_attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=item_attributes))
153
154
def add_to_spine(spine, partno):
    """ Appends an itemref entry for chunk number `partno` to the spine section of content.opf """

    reference = {'idref': 'part%d' % partno}
    spine.append(spine.makeelement(OPFNS('itemref'), attrib=reference))
160
161
class TOC(object):
    """ A node of the table of contents tree

    name: label of the entry
    part_number: number of the partN.html file the entry points to
    sub_number: number of the '#subN' anchor inside that file, or None
    children: nested entries, in document order
    """

    def __init__(self, name=None, part_number=None):
        self.name = name
        self.part_number = part_number
        self.sub_number = None
        self.children = []

    def add(self, name, part_number, level=0, is_part=True):
        """ Adds a new entry `level` levels below this one, under the most recent entries

        If there are no children to descend into, the entry is added right here.
        For is_part=False the entry gets a sub_number (number of its siblings,
        itself included, plus one), which is returned; otherwise returns None.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level-1, is_part)

        entry = TOC(name)
        entry.part_number = part_number
        self.children.append(entry)
        if is_part:
            return None
        entry.sub_number = len(self.children) + 1
        return entry.sub_number

    def append(self, toc):
        """ Adds a whole TOC as a single child entry """
        self.children.append(toc)

    def extend(self, toc):
        """ Adds all children of another TOC as own children """
        self.children.extend(toc.children)

    def depth(self):
        """ Returns the number of nesting levels below this entry (0 for a leaf) """
        if not self.children:
            return 0
        return 1 + max(child.depth() for child in self.children)

    def write_to_xml(self, nav_map, counter):
        """ Writes children (recursively) as navPoint elements under nav_map

        `counter` is the number given to the first navPoint written;
        returns the first number that was not used.
        """
        for entry in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            label = nav_map.makeelement(NCXNS('navLabel'))
            label_text = nav_map.makeelement(NCXNS('text'))
            label_text.text = entry.name
            label.append(label_text)
            nav_point.append(label)

            target = 'part%d.html' % entry.part_number
            if entry.sub_number is not None:
                target += '#sub%d' % entry.sub_number
            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', target)
            nav_point.append(content)

            nav_map.append(nav_point)
            # nested entries continue the numbering right after this one
            counter = entry.write_to_xml(nav_point, counter + 1)
        return counter
213
214
def used_chars(element):
    """ Returns the set of characters found in texts and tails of an element and its descendants """
    own_characters = (element.text or '') + (element.tail or '')
    found = set(own_characters)
    for descendant in element:
        found = found | used_chars(descendant)
    return found
221
222
def chop(main_text):
    """ divides main content of the XML file into chunks

    Yields an 'utwor' element with a single 'master' child holding copies
    of the nodes of the current chunk. A new chunk starts at every
    'naglowek_czesc' and at every 'naglowek_rozdzial', 'naglowek_akt' or
    'srodtytul', unless it immediately follows a 'naglowek_czesc'.

    The very same container object is yielded each time and gets refilled
    afterwards, so every chunk has to be processed before asking for the next one.
    """

    # one reusable container for all the chunks
    container = etree.Element('utwor')
    master = etree.SubElement(container, 'master')

    chapter_headers = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")
    just_after_part_header = False
    for node in main_text:
        tag = node.tag
        opens_part = tag == 'naglowek_czesc'
        opens_chapter = (not opens_part
                         and not just_after_part_header
                         and tag in chapter_headers)
        if opens_part or opens_chapter:
            # hand out what has been gathered so far, then start anew with the header
            yield container
            master[:] = [deepcopy(node)]
            if opens_part:
                just_after_part_header = True
        else:
            master.append(deepcopy(node))
            just_after_part_header = False
    yield container
245
246
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk container; headers are looked for among children of its first child
    chunk_no: number of the chunk, used for TOC entries and annotations
    annotations: element gathering the footnotes found in the chunk
    empty: if true, the chunk's content is replaced with the empty chunk template
           (the TOC is built nevertheless)
    _empty_html_static: internal cache, not meant to be passed by callers
    """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            # remember the anchor number on the element itself
            element.set('sub', str(subnumber))
    if empty:
        # The mutable default argument is intentional: it is shared between calls
        # and keeps the template, so the file is read only once.
        if not _empty_html_static:
            # read inside `with`, so the file handle is closed right away
            with open(get_resource('epub/emptyChunk.html')) as empty_chunk_file:
                _empty_html_static.append(empty_chunk_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
269
270
def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    provider: a DocProvider
    slug: slug of file to process, available by provider
    file_path: path to a local file to process; exactly one of slug and file_path must be given
    output_file: file-like object or path to output file
    output_dir: path to directory to save output file to; either this or output_file must be present
    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
    verbose: if true, output of the font optimizer is not suppressed
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a callable taking author and title, returning an object with save(), ext() and mime_type()
    flags: less-advertising, images, not-wl; each one is also set as an attribute on the root element

    Raises ValueError if both or neither of slug and file_path are given,
    NoDublinCore if the document has no RDF description.
    """

    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns the file's TOC, the next free chunk number, the set of
        characters used and what is left of the sample limit.
        """

        replace_characters(input_xml.getroot())

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            zip_file.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
        elif children:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                with open(get_resource('epub/emptyChunk.html')) as empty_chunk_file:
                    html_string = empty_chunk_file.read()
            else:
                html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            zip_file.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        else:
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        # sample limit used up: remaining chunks are left empty
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zip_file.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        if children:
            for child in children:
                child_xml = etree.parse(provider.by_uri(child))
                child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
                toc.append(child_toc)
                chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    # read metadata from the first file
    if file_path:
        if slug:
            raise ValueError('slug or file_path should be specified, not both')
        with open(file_path, 'r') as f:
            input_xml = etree.parse(f)
    else:
        if not slug:
            raise ValueError('either slug or file_path should be specified')
        input_xml = etree.parse(provider[slug])

    if flags:
        for flag in flags:
            input_xml.getroot().set(flag, 'yes')

    metadata = input_xml.find('.//'+RDFNS('Description'))
    if metadata is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    opened_output = None  # set only when the file is opened here, so that it gets closed here as well
    if output_dir is not None:
        if make_dir:
            author = unicode(book_info.author)
            output_dir = os.path.join(output_dir, author)
            try:
                os.makedirs(output_dir)
            except OSError:
                # deliberately ignored (e.g. the directory already exists);
                # a real problem shows up when the file is opened below
                pass
        if slug:
            output_name = '%s.epub' % slug
        else:
            output_name = os.path.splitext(os.path.basename(file_path))[0] + '.epub'
        # EPUB is a zip archive: it has to be written in binary mode
        opened_output = open(os.path.join(output_dir, output_name), 'wb')
        output_file = opened_output

    opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    spine = opf.find('.//' + OPFNS('spine'))

    zip_file = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # mimetype goes first and is stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    zip_file.writestr(mime, 'application/epub+zip')
    zip_file.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                       'media-type="application/oebps-package+xml" />' \
                       '</rootfiles></container>')
    zip_file.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
    if not flags or 'not-wl' not in flags:
        manifest.append(etree.fromstring(
            '<item id="logo_wolnelektury" href="logo_wolnelektury.png" media-type="image/png" />'))
        zip_file.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))

    if cover:
        cover_file = StringIO()
        c = cover(book_info.author.readable(), book_info.title)
        c.save(cover_file)
        c_name = 'cover.%s' % c.ext()
        zip_file.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
        zip_file.writestr('OPS/cover.html', etree.tostring(
                        cover_tree, method="html", pretty_print=True))

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))

    if flags and 'images' in flags:
        for ilustr in input_xml.findall('//ilustr'):
            src = ilustr.get('src')
            mime = ImageCover(src)().mime_type()
            zip_file.write(src, os.path.join('OPS', src))
            manifest.append(etree.fromstring(
                '<item id="%s" href="%s" media-type="%s" />' % (src, src, mime)))
            # get it up to master
            after = ilustr
            while after.getparent().tag not in ['powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp', 'dramat_wierszowany_l', 'dramat_wierszowany_lp', 'dramat_wspolczesny']:
                after = after.getparent()
            if after is not ilustr:
                # put a copy right after the top-level ancestor
                # and turn the original into an 'extra' element
                moved = deepcopy(ilustr)
                ilustr.tag = 'extra'
                ilustr.text = None
                moved.tail = None
                after.addnext(moved)
    else:
        for ilustr in input_xml.findall('//ilustr'):
            ilustr.tag = 'extra'

    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                               '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                               '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                               '</navPoint></navMap></ncx>')
    nav_map = toc_file[-1]

    toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)

    if not toc.children:
        toc.add(u"Początek utworu", 1)
    # NavPoint-1 is the title page, book's own entries start at 2
    toc_counter = toc.write_to_xml(nav_map, 2)

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        nav_map.append(etree.fromstring(
            '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
            '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
        toc_counter += 1
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        zip_file.writestr('OPS/annotations.html', etree.tostring(
                            html_tree, method="html", pretty_print=True))

    nav_map.append(etree.fromstring(
        '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
        '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    stopka = input_xml.find('//stopka')
    if stopka is not None:
        stopka.tag = 'stopka_'
        replace_by_verse(stopka)
        html_tree = xslt(stopka, get_resource('epub/xsltScheme.xsl'))
    else:
        html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    zip_file.writestr('OPS/last.html', etree.tostring(
                        html_tree, method="html", pretty_print=True))

    # strip fonts
    tmpdir = mkdtemp('-librarian-epub')
    cwd = os.getcwd()
    # the same for every font, so built only once
    chars_argument = ''.join(chars).encode('utf-8')
    try:
        # subset.pl is run from its own directory
        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
        for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
            optimizer_call = ['perl', 'subset.pl', '--chars', chars_argument,
                              get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
            if verbose:
                print("Running font-optimizer")
                subprocess.check_call(optimizer_call)
            else:
                subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            zip_file.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
    finally:
        # restore the working directory and clean up even if the optimizer fails
        os.chdir(cwd)
        rmtree(tmpdir)

    zip_file.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    # NOTE(review): title is put into markup unescaped and parsed by set_inner_xml;
    # presumably a title containing '&' or '<' would fail to parse here - to be confirmed
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
    zip_file.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    zip_file.close()
    if opened_output is not None:
        opened_output.close()