Add raw text option to text conversion (lesmianator will need it).
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import subprocess
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
14 import zipfile
15 from tempfile import mkdtemp
16 from shutil import rmtree
17
18 import sys
19
20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
21 from librarian.dcparser import BookInfo
22
23 from librarian import functions, get_resource
24
# Runs once at import time.  Presumably registers the person-name helper
# that the XSLT stylesheets rely on -- TODO confirm in librarian.functions.
functions.reg_person_name()
26
27
def inner_xml(node):
    """ serializes everything inside a node: its leading text plus each child

    The node's own tag is not included in the result.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    pieces = [node.text or '']
    pieces.extend(etree.tostring(child) for child in node)
    return ''.join(pieces)
37
def set_inner_xml(node, text):
    """ replaces a node's text and children with the parsed content of a string

    The string is parsed inside a throw-away wrapper element, so it may hold
    leading text followed by any number of elements.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    wrapper = etree.fromstring('<x>%s</x>' % text)
    node.text = wrapper.text
    # slice assignment swaps all existing children for the parsed ones
    node[:] = wrapper[:]
50
51
def node_name(node):
    """ Extract the plain-text name of a node

    Works on a copy: 'pe', 'pa', 'pt', 'pr' and 'motyw' descendants are
    emptied (their tails are kept), then all remaining markup is stripped.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    scratch = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for unwanted in scratch.findall('.//' + tag):
            # clear() would also drop the tail, so carry it over by hand
            kept_tail = unwanted.tail
            unwanted.clear()
            unwanted.tail = kept_tail
    etree.strip_tags(scratch, '*')
    return scratch.text
68
69
def xslt(xml, sheet):
    """ applies the XSLT stylesheet stored at path `sheet` to `xml`

    `xml` may be an element or a tree; a bare element is wrapped in an
    ElementTree first.  Returns the result of the transformation.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
75
76
def replace_characters(node):
    """ Replaces ASCII punctuation with typographic characters, in place

    Walks the node's text, tail and all descendants recursively.  The
    replacements are: a byte-order mark is removed, '---' becomes an em dash,
    '--' an en dash, ',,' a low double quote, '"' a right double quote and
    "'" a right single quote.

    An 'extra' element is emptied instead of being processed.  Its tail is
    kept (and converted like any other text), because the tail belongs to
    the surrounding content rather than to the element itself.
    """
    def replace_chars(text):
        if text is None:
            return None
        # order matters: '---' has to be handled before '--'
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")
    if node.tag == 'extra':
        # clear() resets the tail as well, so save it and put it back
        tail = node.tail
        node.clear()
        node.tail = replace_chars(tail)
    else:
        node.text = replace_chars(node.text)
        node.tail = replace_chars(node.tail)
        for child in node:
            replace_characters(child)
94
95
def find_annotations(annotations, source, part_no):
    """ Collects footnotes found below `source` into `annotations`

    Each 'pe', 'pa', 'pt' or 'pr' descendant is copied into `annotations`
    with 'number' and 'part' attributes set; the original element is emptied
    and left holding only its number (its tail is preserved).  The contents
    of 'extra' elements are not searched.
    """
    footnote_tags = ('pe', 'pa', 'pt', 'pr')
    for element in source:
        if element.tag in footnote_tags:
            note = deepcopy(element)
            index = str(len(annotations) + 1)
            note.set('number', index)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)
            # clear() wipes the tail too, so restore it afterwards
            kept_tail = element.tail
            element.clear()
            element.tail = kept_tail
            element.text = index
        if element.tag != 'extra':
            find_annotations(annotations, element, part_no)
111
112
def replace_by_verse(tree):
    """ Splits stanzas into verse elements wherever a '/' ends a line

    Inside every stanza, the content of 'slowo_obce' and 'wyroznienie'
    children is split first, then the stanza itself.  Content without any
    '/'-newline separator is left untouched.
    """

    def wrap_verses(fragments, passthrough):
        # fragments already starting with a pass-through prefix stay as they
        # are; everything else is wrapped in a <wers_normalny> element
        return ''.join(
            fragment if fragment.startswith(passthrough)
            else ''.join(('<wers_normalny>', fragment, '</wers_normalny>'))
            for fragment in fragments)

    for stanza in tree.findall('.//' + WLNS('strofa')):
        for inline in stanza:
            if inline.tag in ('slowo_obce', 'wyroznienie'):
                pieces = inner_xml(inline).split('/\n')
                if len(pieces) > 1:
                    set_inner_xml(inline, wrap_verses(pieces, '<wers'))
        lines = inner_xml(stanza).split('/\n')
        if len(lines) > 1:
            set_inner_xml(stanza, wrap_verses(lines, ('<wers', '<extra')))
138
139
def add_to_manifest(manifest, partno):
    """ Appends an item entry for part number `partno` to the manifest element """

    item_id = 'part%d' % partno
    attributes = {
        'id': item_id,
        'href': item_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
150
151
def add_to_spine(spine, partno):
    """ Appends an itemref entry for part number `partno` to the spine element """

    reference = {'idref': 'part%d' % partno}
    itemref = spine.makeelement(OPFNS('itemref'), attrib=reference)
    spine.append(itemref)
157
158
class TOC(object):
    """ A table-of-contents node: a name, a target part and child entries """

    def __init__(self, name=None, part_number=None):
        self.children = []
        self.name = name
        self.part_number = part_number
        # set only for entries that point at an anchor inside a part
        self.sub_number = None

    def add(self, name, part_number, level=0, is_part=True):
        """ Adds an entry `level` steps below this node, under the last child

        If there is no child to descend into, the entry is added right here.
        For entries that are not parts, assigns and returns a sub number;
        otherwise returns None.
        """
        if level <= 0 or not self.children:
            entry = TOC(name, part_number)
            self.children.append(entry)
            if is_part:
                return None
            entry.sub_number = len(self.children) + 1
            return entry.sub_number
        return self.children[-1].add(name, part_number, level - 1, is_part)

    def append(self, toc):
        """ Adds `toc` itself as a child """
        self.children.append(toc)

    def extend(self, toc):
        """ Adds all children of `toc` as children of this node """
        self.children.extend(toc.children)

    def depth(self):
        """ Returns the number of levels below this node (0 for a leaf) """
        if not self.children:
            return 0
        return 1 + max(child.depth() for child in self.children)

    def write_to_xml(self, nav_map, counter):
        """ Writes children as nested navPoint elements under `nav_map`

        `counter` numbers the first navPoint written; the number following
        the last one written is returned.
        """
        make = nav_map.makeelement
        for entry in self.children:
            nav_point = make(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            label = make(NCXNS('navLabel'))
            label_text = make(NCXNS('text'))
            label_text.text = entry.name
            label.append(label_text)
            nav_point.append(label)

            target = 'part%d.html' % entry.part_number
            if entry.sub_number is not None:
                target += '#sub%d' % entry.sub_number
            content = make(NCXNS('content'))
            content.set('src', target)
            nav_point.append(content)

            nav_map.append(nav_point)
            counter = entry.write_to_xml(nav_point, counter + 1)
        return counter
210
211
def used_chars(element):
    """ Returns the set of characters occurring in an element's text and tail,
    and in those of all its descendants """
    own_text = (element.text or '') + (element.tail or '')
    found = set(own_text)
    for sub in element:
        found |= used_chars(sub)
    return found
218
219
def chop(main_text):
    """ splits the main content into chunks, yielding one container per chunk

    A new chunk starts at every 'naglowek_czesc', and at every
    'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul' that does not directly
    follow a 'naglowek_czesc'.  The same container object is yielded each
    time and refilled afterwards, so it has to be consumed before advancing.
    """

    # one <utwor><master/></utwor> container, reused for every chunk
    chunk = etree.Element('utwor')
    master = etree.SubElement(chunk, 'master')

    chapter_tags = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")
    after_part_header = False
    for element in main_text:
        tag = element.tag
        starts_part = tag == 'naglowek_czesc'
        starts_chapter = not after_part_header and tag in chapter_tags
        if starts_part or starts_chapter:
            yield chunk
            master[:] = [deepcopy(element)]
            if starts_part:
                after_part_header = True
        else:
            master.append(deepcopy(element))
            after_part_header = False
    yield chunk
242
243
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk container whose first child holds the chunk's elements
    chunk_no: number of the part this chunk is written to
    annotations: element that collects footnotes found in the chunk
    empty: if true, the chunk's content is replaced by the empty-chunk
        template (TOC entries are still generated)
    _empty_html_static: private cache for the empty-chunk template; the
        mutable default is deliberate, it keeps the template between calls
    """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            # store the sub number on the element itself
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # read the template once and close the file right away
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
266
267
def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    provider: a DocProvider
    slug: slug of file to process, available by provider
    file_path: path of the file to process; either this or slug must be given, not both
    output_file: file-like object or path to output file
    output_dir: path to directory to save output file to; either this or output_file must be present
    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
    verbose: announce the font optimizer runs and do not silence their output
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover object
    flags: less-advertising,

    Raises ValueError when both or neither of slug and file_path are given,
    and NoDublinCore when the document carries no metadata description.
    """

    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children """

        replace_characters(input_xml.getroot())

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
        elif children:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                with open(get_resource('epub/emptyChunk.html')) as empty_file:
                    html_string = empty_file.read()
            else:
                html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            epub_zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        else:
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    # once the sample budget is used up, remaining chunks are emptied
                    if sample <= 0:
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        if children:
            for child in children:
                child_xml = etree.parse(provider.by_uri(child))
                child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
                toc.append(child_toc)
                chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    # read metadata from the first file
    if file_path:
        if slug:
            raise ValueError('slug or file_path should be specified, not both')
        with open(file_path, 'r') as f:
            input_xml = etree.parse(f)
    else:
        if not slug:
            raise ValueError('either slug or file_path should be specified')
        input_xml = etree.parse(provider[slug])

    if flags:
        # every flag becomes an attribute on the root element
        for flag in flags:
            input_xml.getroot().set(flag, 'yes')

    metadata = input_xml.find('.//'+RDFNS('Description'))
    if metadata is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    opened_output = False
    if output_dir is not None:
        if make_dir:
            author = unicode(book_info.author)
            output_dir = os.path.join(output_dir, author)
            try:
                os.makedirs(output_dir)
            except OSError:
                # best effort: the directory may already exist
                pass
        # an EPUB is a ZIP archive, so the file has to be opened in binary mode
        if slug:
            output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'wb')
        else:
            output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'wb')
        opened_output = True

    epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # the mimetype entry is written first and stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    epub_zip.writestr(mime, 'application/epub+zip')
    epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                       'media-type="application/oebps-package+xml" />' \
                       '</rootfiles></container>')
    epub_zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
    epub_zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))

    opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    spine = opf.find('.//' + OPFNS('spine'))

    if cover:
        cover_file = StringIO()
        c = cover(book_info.author.readable(), book_info.title)
        c.save(cover_file)
        c_name = 'cover.%s' % c.ext()
        epub_zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
        epub_zip.writestr('OPS/cover.html', etree.tostring(
                        cover_tree, method="html", pretty_print=True))

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))


    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                               '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                               '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                               '</navPoint></navMap></ncx>')
    nav_map = toc_file[-1]

    toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)

    if not toc.children:
        toc.add(u"Początek utworu", 1)
    # NavPoint-1 is the title page, so generated entries start at 2
    toc_counter = toc.write_to_xml(nav_map, 2)

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        nav_map.append(etree.fromstring(
            '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
            '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
        toc_counter += 1
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        epub_zip.writestr('OPS/annotations.html', etree.tostring(
                            html_tree, method="html", pretty_print=True))

    nav_map.append(etree.fromstring(
        '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
        '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    epub_zip.writestr('OPS/last.html', etree.tostring(
                        html_tree, method="html", pretty_print=True))

    # strip fonts
    # the character list is the same for every font, so build it only once
    chars_arg = ''.join(chars).encode('utf-8')
    tmpdir = mkdtemp('-librarian-epub')
    cwd = os.getcwd()
    try:
        # subset.pl is invoked by relative name, so run from its directory
        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
        for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
            optimizer_call = ['perl', 'subset.pl', '--chars', chars_arg,
                              get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
            if verbose:
                print("Running font-optimizer")
                subprocess.check_call(optimizer_call)
            else:
                # discard the output; unread pipes could block the child process
                with open(os.devnull, 'w') as devnull:
                    subprocess.check_call(optimizer_call, stdout=devnull, stderr=devnull)
            epub_zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
    finally:
        # restore the working directory and clean up even if the optimizer fails
        os.chdir(cwd)
        rmtree(tmpdir)

    epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    # NOTE(review): the title is inserted into markup unescaped; a title
    # containing '&' or '<' would presumably fail to parse here -- confirm.
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
    epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    epub_zip.close()
    if opened_output:
        # only close what was opened here; a caller-supplied file stays open
        output_file.close()