encode unicode for lxml
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import subprocess
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
14 import zipfile
15 from tempfile import mkdtemp
16 from shutil import rmtree
17
18 import sys
19
20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
21 from librarian.dcparser import BookInfo
22
23 from librarian import functions, get_resource
24
# NOTE(review): presumably registers a person-name helper needed by the XSLT
# stylesheets applied in this module; confirm in librarian.functions.
functions.reg_person_name()
26
27
def inner_xml(node):
    """ serializes the content of a node: its leading text followed by its children

    Each child is serialized together with its tail text.

    >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
    x<b>y</b>z
    """

    pieces = [node.text or '']
    for child in node:
        pieces.append(etree.tostring(child))
    return ''.join(pieces)
37
def set_inner_xml(node, text):
    """ replaces the content of a node (text and children) with a parsed XML fragment

    The fragment is parsed inside a temporary wrapper element, so it has
    to be well-formed XML content.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print(etree.tostring(e))
    <a>x<b>y</b>z</a>
    """

    wrapper = etree.fromstring('<x>%s</x>' % text)
    new_children = wrapper[:]
    node.text = wrapper.text
    node[:] = new_children
50
51
def node_name(node):
    """ Returns the plain text of a node, without annotations and themes

    Works on a copy, so the node itself is left untouched.

    >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
    XYZ
    """

    clone = deepcopy(node)

    # empty the annotation and theme elements, keeping the text after them
    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for hit in clone.findall('.//' + tag):
            saved_tail = hit.tail
            hit.clear()
            hit.tail = saved_tail

    # flatten the remaining markup into the text of the copy
    etree.strip_tags(clone, '*')
    return clone.text
68
69
def xslt(xml, sheet):
    """ applies the XSLT stylesheet from file `sheet` to `xml`

    xml: an element or an element tree
    sheet: path to the stylesheet file
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as sheet_file:
        stylesheet = etree.parse(sheet_file)
        return document.xslt(stylesheet)
75
76
def replace_characters(node):
    """ replaces ASCII punctuation with typographic characters, in place

    Works recursively on the text and tail of the node and of all its
    descendants. Elements 'uwaga' and 'extra' are emptied (their tail is kept).
    """
    # order matters: '---' has to be handled before '--'
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    if node.tag in ('uwaga', 'extra'):
        kept_tail = node.tail
        node.clear()
        node.tail = kept_tail

    node.text, node.tail = replace_chars(node.text), replace_chars(node.tail)
    for child in node:
        replace_characters(child)
95
96
def find_annotations(annotations, source, part_no):
    """ moves annotations found under `source` into `annotations`

    A copy of every annotation element ('pe', 'pa', 'pt', 'pr') is appended
    to `annotations` with attributes 'number' (consecutive, starting at 1)
    and 'part' (part_no). The original element is emptied and left
    containing just the number. Elements 'extra' and 'uwaga' are not searched.
    """
    for elem in source:
        if elem.tag in ('pe', 'pa', 'pt', 'pr'):
            number = str(len(annotations) + 1)
            note = deepcopy(elem)

            # leave only the reference number in the main text
            kept_tail = elem.tail
            elem.clear()
            elem.tail = kept_tail
            elem.text = number

            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)
        if elem.tag in ('extra', 'uwaga'):
            continue
        find_annotations(annotations, elem, part_no)
112
113
def replace_by_verse(tree):
    """ Splits stanzas into verse elements at every '/' ending a line

    Fragments which already start with a verse element (or, directly in
    a stanza, with an 'extra' element) are kept as they are; every other
    fragment is wrapped in 'wers_normalny'. Foreign words and emphasis
    spanning many verses are split the same way.
    """

    def wrap(fragments, passthrough):
        return ''.join(
            fragment if fragment.startswith(passthrough)
            else '<wers_normalny>' + fragment + '</wers_normalny>'
            for fragment in fragments)

    for stanza in tree.findall('.//' + WLNS('strofa')):
        for inner in stanza:
            if inner.tag not in ('slowo_obce', 'wyroznienie'):
                continue
            pieces = inner_xml(inner).split('/\n')
            if len(pieces) > 1:
                set_inner_xml(inner, wrap(pieces, '<wers'))

        lines = inner_xml(stanza).split('/\n')
        if len(lines) > 1:
            set_inner_xml(stanza, wrap(lines, ('<wers', '<extra')))
139
140
def add_to_manifest(manifest, partno):
    """ Appends an item for part number `partno` to the manifest element of content.opf """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
151
152
def add_to_spine(spine, partno):
    """ Appends a reference to part number `partno` to the spine element of content.opf """

    attributes = {'idref': 'part%d' % partno}
    spine.append(spine.makeelement(OPFNS('itemref'), attrib=attributes))
158
159
class TOC(object):
    """ A node of the table of contents tree

    name: label of the entry
    part_number: number of the partN.html file the entry points to
    sub_number: number of the '#subN' anchor inside the part, or None
    children: list of child entries
    """

    def __init__(self, name=None, part_number=None):
        self.name = name
        self.part_number = part_number
        self.sub_number = None
        self.children = []

    def add(self, name, part_number, level=0, is_part=True):
        """ adds an entry `level` levels below this node, under the last children

        If there are no children to descend into, the entry is added right here.
        For entries which are not parts (is_part=False), an anchor number
        is assigned and returned; otherwise None is returned.
        """
        if level > 0 and self.children:
            last_child = self.children[-1]
            return last_child.add(name, part_number, level - 1, is_part)

        entry = TOC(name, part_number)
        self.children.append(entry)
        if is_part:
            return None
        entry.sub_number = len(self.children) + 1
        return entry.sub_number

    def append(self, toc):
        """ adds `toc` as the last child """
        self.children.append(toc)

    def extend(self, toc):
        """ adds all the children of `toc` as children of this node """
        self.children.extend(toc.children)

    def depth(self):
        """ returns the number of levels below this node """
        if not self.children:
            return 0
        return 1 + max(child.depth() for child in self.children)

    def write_to_xml(self, nav_map, counter):
        """ writes the children as navPoint elements appended to `nav_map`

        counter: number to use for the first navPoint
        Returns the first number which was not used.
        """
        for entry in self.children:
            target = 'part%d.html' % entry.part_number
            if entry.sub_number is not None:
                target = '%s#sub%d' % (target, entry.sub_number)

            point = nav_map.makeelement(NCXNS('navPoint'))
            point.set('id', 'NavPoint-%d' % counter)
            point.set('playOrder', str(counter))

            label_text = nav_map.makeelement(NCXNS('text'))
            label_text.text = entry.name
            label = nav_map.makeelement(NCXNS('navLabel'))
            label.append(label_text)
            point.append(label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', target)
            point.append(content)

            nav_map.append(point)
            # children of the entry are numbered right after it
            counter = entry.write_to_xml(point, counter + 1)
        return counter
211
212
def used_chars(element):
    """ Returns a set of the characters used in the text and tail of an element and all its descendants """
    found = set(element.text or '')
    found.update(element.tail or '')
    for child in element:
        found |= used_chars(child)
    return found
219
220
def chop(main_text):
    """ generates chunks of the main content of the XML file

    A new chunk is started by each 'naglowek_czesc' and by each
    'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul' which does not
    directly follow a 'naglowek_czesc'.

    The same container element (utwor/master) is yielded every time, with
    its content replaced, so each chunk has to be used before the next
    one is requested.
    """

    # the container reused for all the chunks
    container = etree.Element('utwor')
    master = etree.SubElement(container, 'master')

    after_part_header = False
    for node in main_text:
        tag = node.tag
        starts_part = tag == 'naglowek_czesc'
        starts_section = (not after_part_header and
                          tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"))
        if starts_part or starts_section:
            yield container
            master[:] = [deepcopy(node)]
            after_part_header = starts_part
        else:
            master.append(deepcopy(node))
            after_part_header = False
    yield container
243
244
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: container element; the chunk's content is in its first child
    chunk_no: number of the chunk, used in TOC entries and in annotations
    annotations: element to which the annotations found in the chunk are appended
    empty: if true, the static placeholder page is returned instead of
        the chunk's content (the TOC is built anyway)
    _empty_html_static: private cache of the placeholder page; the mutable
        default is intentional, so that the file is read only once
    """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            # mark the header with its anchor number
            element.set('sub', str(subnumber))

    if empty:
        if not _empty_html_static:
            # close the file explicitly instead of leaving it to the garbage collector
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        return _empty_html_static[0], toc, set()

    find_annotations(annotations, chunk_xml, chunk_no)
    replace_by_verse(chunk_xml)
    html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
    chars = used_chars(html_tree.getroot())
    output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
267
268
def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    provider: a DocProvider
    slug: slug of file to process, available by provider
    file_path: path to the file to process; either this or slug must be given, not both
    output_file: file-like object or path to output file
    output_dir: path to directory to save output file to; either this or output_file must be present
    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
    verbose: if true, the output of the font optimizer is not suppressed
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover class (e.g. cover.Cover); it is called with the author's name and the title
    flags: names of attributes to set to 'yes' on the root element of
        the document before processing (e.g. less-advertising)

    Raises ValueError if both or none of slug and file_path are given,
    NoDublinCore if the document has no metadata and
    subprocess.CalledProcessError if the font optimizer fails.
    """
    # imported here, because only this function needs it
    from xml.sax.saxutils import escape

    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns the TOC of the file, the next free chunk number, a set
        of used characters and the remaining number of sample paragraphs.
        """

        replace_characters(input_xml.getroot())

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
        elif children:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                with open(get_resource('epub/emptyChunk.html')) as empty_file:
                    html_string = empty_file.read()
            else:
                html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        else:
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        # the sample is long enough, the rest is replaced by placeholders
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        if children:
            for child in children:
                child_xml = etree.parse(provider.by_uri(child))
                child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
                toc.append(child_toc)
                chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    # read metadata from the first file
    if file_path:
        if slug:
            raise ValueError('slug or file_path should be specified, not both')
        with open(file_path, 'r') as f:
            input_xml = etree.parse(f)
    else:
        if not slug:
            raise ValueError('either slug or file_path should be specified')
        input_xml = etree.parse(provider[slug])

    if flags:
        for flag in flags:
            input_xml.getroot().set(flag, 'yes')

    metadata = input_xml.find('.//'+RDFNS('Description'))
    if metadata is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    opened_output = False
    if output_dir is not None:
        if make_dir:
            author = unicode(book_info.author)
            output_dir = os.path.join(output_dir, author)
            try:
                os.makedirs(output_dir)
            except OSError:
                # an existing directory is fine, anything else is a real error
                if not os.path.isdir(output_dir):
                    raise
        if slug:
            output_path = os.path.join(output_dir, '%s.epub' % slug)
        else:
            output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub')
        # binary mode: a zip file must not go through newline translation
        output_file = open(output_path, 'wb')
        opened_output = True

    zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # the mimetype entry is written first and stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    zip.writestr(mime, 'application/epub+zip')
    zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                       'media-type="application/oebps-package+xml" />' \
                       '</rootfiles></container>')
    zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
    zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))

    opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    spine = opf.find('.//' + OPFNS('spine'))

    if cover:
        cover_file = StringIO()
        c = cover(book_info.author.readable(), book_info.title)
        c.save(cover_file)
        c_name = 'cover.%s' % c.ext()
        zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
        zip.writestr('OPS/cover.html', etree.tostring(
                        cover_tree, method="html", pretty_print=True))

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))


    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                               '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                               '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                               '</navPoint></navMap></ncx>')
    nav_map = toc_file[-1]

    toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)

    if not toc.children:
        toc.add(u"Początek utworu", 1)
    # number 1 is taken by the title page
    toc_counter = toc.write_to_xml(nav_map, 2)

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        nav_map.append(etree.fromstring(
            '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
            '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
        toc_counter += 1
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        zip.writestr('OPS/annotations.html', etree.tostring(
                            html_tree, method="html", pretty_print=True))

    nav_map.append(etree.fromstring(
        '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
        '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    zip.writestr('OPS/last.html', etree.tostring(
                        html_tree, method="html", pretty_print=True))

    # strip fonts
    tmpdir = mkdtemp('-librarian-epub')
    cwd = os.getcwd()
    try:
        # the optimizer script is run from its own directory
        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
        for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
            optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
                              get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
            if verbose:
                print("Running font-optimizer")
                subprocess.check_call(optimizer_call)
            else:
                # discard the output; pipes nobody reads could block the child process
                with open(os.devnull, 'w') as devnull:
                    subprocess.check_call(optimizer_call, stdout=devnull, stderr=devnull)
            zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
    finally:
        # restore the working directory and clean up, also on failure
        os.chdir(cwd)
        rmtree(tmpdir)

    zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    # the title is plain text, so it has to be escaped before being parsed as XML
    set_inner_xml(toc_file[1], ''.join(('<text>', escape(title), '</text>')))
    zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    zip.close()
    if opened_output:
        # closing the archive does not close a file object passed to it
        output_file.close()