666bcc9295b4fb31f8a48665e8a993d5b6080c27
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.  
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import subprocess
11 from copy import deepcopy
12 from lxml import etree
13 import zipfile
14 from tempfile import mkdtemp
15 from shutil import rmtree
16
17 import sys
18
19 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
20 from librarian.dcparser import BookInfo
21
22 from librarian import functions, get_resource
23
# Import-time side effect: runs once when this module is loaded.
# NOTE(review): presumably registers the person_name extension function used
# by the epub XSLT stylesheets -- confirm in librarian.functions.
functions.reg_person_name()
25
26
def inner_xml(node):
    """ serializes the content of a node: its leading text followed by
    every child element (as produced by etree.tostring)

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    # a node without leading text contributes an empty string
    pieces = ['' if node.text is None else node.text]
    pieces.extend(etree.tostring(element) for element in node)
    return ''.join(pieces)
36
def set_inner_xml(node, text):
    """ replaces node's leading text and children with the parsed
    content of an XML fragment given as a string

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    # wrap the fragment in a throwaway root so it parses as one document
    wrapped = '<x>%s</x>' % text
    parsed = etree.fromstring(wrapped)
    new_children = list(parsed)
    node.text = parsed.text
    node[:] = new_children
49
50
def node_name(node):
    """ Extract a plain-text name from a node; the content of
    annotation and theme elements (pe, pa, pt, pr, motyw) is left out

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    # work on a copy, the caller's tree must stay untouched
    scratch = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for found in scratch.findall('.//' + tag):
            # empty the element but keep the text that follows it
            following = found.tail
            found.clear()
            found.tail = following
    # flatten all remaining markup into the root's text
    etree.strip_tags(scratch, '*')
    return scratch.text
67
68
def xslt(xml, sheet):
    """ applies the XSLT stylesheet stored in file `sheet` to `xml`

    xml: an element (wrapped in an ElementTree first) or an object
         that already offers an xslt() method
    sheet: path to the stylesheet file
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as sheet_file:
        stylesheet = etree.parse(sheet_file)
    return document.xslt(stylesheet)
74
75
def replace_characters(node):
    """ applies typographic replacements to all text in the tree, in place

    Removes BOM characters and turns "---", "--", ",,", '"' and "'" into
    an em dash, an en dash, a low double quote, a right double quote and
    a right single quote respectively. The content of <extra> elements is
    dropped, but the text following such an element is kept.
    """
    def replace_chars(text):
        if text is None:
            return None
        # "---" has to be handled before "--"
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")
    if node.tag == 'extra':
        # clear() also resets the tail, which is ordinary text of the
        # parent element and must not be lost together with the note
        tail = node.tail
        node.clear()
        node.tail = replace_chars(tail)
        return
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
93
94
def find_annotations(annotations, source, part_no):
    """ collects annotation elements (pe, pa, pt, pr) found below `source`

    A copy of every annotation, with 'number' and 'part' attributes set,
    is appended to `annotations`; the original element is reduced to the
    annotation's number. Children of 'extra' and 'podtytul' elements are
    not searched.
    """
    annotation_tags = ('pe', 'pa', 'pt', 'pr')
    skipped_tags = ('extra', 'podtytul')

    for element in source:
        if element.tag in annotation_tags:
            index = str(len(annotations) + 1)
            note = deepcopy(element)
            note.set('number', index)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)
            # leave only the number in the text, keep whatever follows
            following = element.tail
            element.clear()
            element.tail = following
            element.text = index
        if element.tag in skipped_tags:
            continue
        find_annotations(annotations, element, part_no)
110
111
def replace_by_verse(tree):
    """ Splits stanzas into verses at every '/' that ends a line

    Works in place on every 'strofa' element of the tree, and first on
    its 'slowo_obce' and 'wyroznienie' children.
    """

    def versify(xml_text, verbatim_prefixes):
        # returns None when there is nothing to split
        pieces = xml_text.split('/\n')
        if len(pieces) < 2:
            return None
        # pieces that already start with a verse tag are copied unchanged
        return ''.join(
            piece if piece.startswith(verbatim_prefixes)
            else '<wers_normalny>' + piece + '</wers_normalny>'
            for piece in pieces)

    for stanza in tree.findall('.//' + WLNS('strofa')):
        for inline in stanza:
            if inline.tag not in ('slowo_obce', 'wyroznienie'):
                continue
            rebuilt = versify(inner_xml(inline), ('<wers',))
            if rebuilt is not None:
                set_inner_xml(inline, rebuilt)
        rebuilt = versify(inner_xml(stanza), ('<wers', '<extra'))
        if rebuilt is not None:
            set_inner_xml(stanza, rebuilt)
137
138
def add_to_manifest(manifest, partno):
    """ Appends an item for part number `partno` to the manifest element
    of the content.opf file """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
149
150
def add_to_spine(spine, partno):
    """ Appends an itemref for part number `partno` to the spine element
    of the content.opf file """

    attributes = {'idref': 'part%d' % partno}
    spine.append(spine.makeelement(OPFNS('itemref'), attrib=attributes))
156
157
class TOC(object):
    """ A node of the table of contents tree

    name: label of the entry
    part_number: number of the partN.html file the entry points to
    sub_number: number of the #subN anchor inside that file, or None
    children: nested TOC entries
    """

    def __init__(self, name=None, part_number=None):
        self.name = name
        self.part_number = part_number
        self.sub_number = None
        self.children = []

    def add(self, name, part_number, level=0, is_part=True):
        """ adds an entry `level` steps down the chain of last children
        (or as deep as that chain goes)

        Returns the new entry's sub_number when is_part is false,
        None otherwise.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level-1, is_part)
        entry = TOC(name, part_number)
        self.children.append(entry)
        if is_part:
            return None
        entry.sub_number = len(self.children) + 1
        return entry.sub_number

    def append(self, toc):
        """ attaches `toc` as the last child """
        self.children.append(toc)

    def extend(self, toc):
        """ attaches all children of `toc` as children of this node """
        self.children.extend(toc.children)

    def depth(self):
        """ returns the number of levels below this node """
        if not self.children:
            return 0
        return 1 + max(entry.depth() for entry in self.children)

    def write_to_xml(self, nav_map, counter):
        """ writes the children as nested navPoint elements into nav_map

        counter: number given to the first navPoint written
        Returns the first number not used.
        """
        for entry in self.children:
            point = nav_map.makeelement(NCXNS('navPoint'))
            point.set('id', 'NavPoint-%d' % counter)
            point.set('playOrder', str(counter))

            label = nav_map.makeelement(NCXNS('navLabel'))
            label_text = nav_map.makeelement(NCXNS('text'))
            label_text.text = entry.name
            label.append(label_text)
            point.append(label)

            target = 'part%d.html' % entry.part_number
            if entry.sub_number is not None:
                target += '#sub%d' % entry.sub_number
            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', target)
            point.append(content)

            nav_map.append(point)
            # nested entries continue the numbering
            counter = entry.write_to_xml(point, counter + 1)
        return counter
209
210
def used_chars(element):
    """ Returns the set of characters found in the text and tail of an
    ETree Element and of all its descendants """
    own_text = (element.text or '') + (element.tail or '')
    found = set(own_text)
    for sub in element:
        found.update(used_chars(sub))
    return found
217
218
def chop(main_text):
    """ splits the children of main_text into chunks

    Generator. Every chunk is an 'utwor' element holding a single
    'master' element with copies of the nodes. The same container object
    is yielded each time and refilled afterwards, so it has to be used
    before the next one is requested.
    """

    container = etree.Element('utwor')
    master = etree.SubElement(container, 'master')
    chapter_tags = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    # true right after a part header: a chapter header directly
    # following it stays in the same chunk
    after_part_header = False
    for node in main_text:
        tag = node.tag
        starts_part = tag == 'naglowek_czesc'
        starts_chapter = not after_part_header and tag in chapter_tags
        if starts_part or starts_chapter:
            # hand out what was gathered so far, then start afresh
            yield container
            master[:] = [deepcopy(node)]
            after_part_header = starts_part
        else:
            master.append(deepcopy(node))
            after_part_header = False
    yield container
241
242
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk element; its first child holds the content nodes
    chunk_no: number of the chunk, used for TOC entries and annotations
    annotations: element collecting the annotations found in the chunk
    empty: if true, the content is replaced by the empty chunk template
    _empty_html_static: private cache for the template; the mutable
        default is deliberate, it is shared between calls
    """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            # store the number on the element; it is the N of the #subN
            # anchor the TOC entry points to
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # read the template once and close the file right away
            with open(get_resource('epub/emptyChunk.html')) as empty_html:
                _empty_html_static.append(empty_html.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
265
266
def _write_stripped_fonts(epub_zip, chars, verbose):
    """ subsets the DejaVu Serif fonts to `chars` and stores them in the archive """

    tmpdir = mkdtemp('-librarian-epub')
    cwd = os.getcwd()
    try:
        # subset.pl is run from the font-optimizer directory
        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
        chars_arg = ''.join(chars).encode('utf-8')
        for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
            optimizer_call = ['perl', 'subset.pl', '--chars', chars_arg,
                              get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
            if verbose:
                print("Running font-optimizer")
                subprocess.check_call(optimizer_call)
            else:
                # discard the output; pipes that nobody reads could fill
                # up and block the subprocess
                with open(os.devnull, 'w') as devnull:
                    subprocess.check_call(optimizer_call, stdout=devnull, stderr=devnull)
            epub_zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
    finally:
        # restore the working directory and clean up even on failure
        os.chdir(cwd)
        rmtree(tmpdir)


def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, sample=None):
    """ produces a EPUB file

    provider: a DocProvider
    slug: slug of file to process, available by provider
    file_path: path of the file to process; exactly one of slug and
               file_path must be given
    output_file: file-like object or path to output file
    output_dir: path to directory to save output file to; either this or output_file must be present
    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
    verbose: if true, the output of the font optimizer is not suppressed
    sample=n: generate sample e-book (with at least n paragraphs)

    Raises ValueError if both or none of slug and file_path are given,
    NoDublinCore if the document has no RDF description.
    """

    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns the file's TOC, the next free chunk number, the set of
        used characters and the remaining sample counter.
        """

        replace_characters(input_xml.getroot())

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
        elif children:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                with open(get_resource('epub/emptyChunk.html')) as empty_html:
                    html_string = empty_html.read()
            else:
                html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            epub_zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        else:
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        # sample is used up, remaining chunks stay empty
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in children:
            child_xml = etree.parse(provider.by_uri(child))
            child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    # read metadata from the first file
    if file_path:
        if slug:
            raise ValueError('slug or file_path should be specified, not both')
        with open(file_path, 'r') as f:
            input_xml = etree.parse(f)
    else:
        if not slug:
            raise ValueError('either slug or file_path should be specified')
        input_xml = etree.parse(provider[slug])

    metadata = input_xml.find('.//'+RDFNS('Description'))
    if metadata is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    owns_output = False
    if output_dir is not None:
        if make_dir:
            author = unicode(book_info.author)
            output_dir = os.path.join(output_dir, author)
            try:
                os.makedirs(output_dir)
            except OSError:
                # typically the directory exists already; any other
                # problem shows up when the file is opened below
                pass
        if slug:
            epub_name = '%s.epub' % slug
        else:
            epub_name = os.path.splitext(os.path.basename(file_path))[0] + '.epub'
        # a zip archive is binary data
        output_file = open(os.path.join(output_dir, epub_name), 'wb')
        owns_output = True

    try:
        epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

        # write static elements
        # the mimetype entry is stored uncompressed, before anything else
        mime = zipfile.ZipInfo()
        mime.filename = 'mimetype'
        mime.compress_type = zipfile.ZIP_STORED
        mime.extra = ''
        epub_zip.writestr(mime, 'application/epub+zip')
        epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                           'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                           '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                           'media-type="application/oebps-package+xml" />' \
                           '</rootfiles></container>')
        epub_zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
        epub_zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))

        opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
        manifest = opf.find('.//' + OPFNS('manifest'))
        spine = opf.find('.//' + OPFNS('spine'))

        annotations = etree.Element('annotations')

        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                                   '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                                   '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                                   'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                                   '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                                   '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                                   '</navPoint></navMap></ncx>')
        nav_map = toc_file[-1]

        toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)

        if not toc.children:
            toc.add(u"Początek utworu", 1)
        # NavPoint-1 is the title page, so numbering starts at 2
        toc_counter = toc.write_to_xml(nav_map, 2)

        # Last modifications in container files and EPUB creation
        if len(annotations) > 0:
            nav_map.append(etree.fromstring(
                '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
                '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
            manifest.append(etree.fromstring(
                '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
            spine.append(etree.fromstring(
                '<itemref idref="annotations" />'))
            replace_by_verse(annotations)
            html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
            chars = chars.union(used_chars(html_tree.getroot()))
            epub_zip.writestr('OPS/annotations.html', etree.tostring(
                                html_tree, method="html", pretty_print=True))

        # strip fonts
        _write_stripped_fonts(epub_zip, chars, verbose)

        epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
        title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
        attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
        for st in attributes:
            meta = toc_file.makeelement(NCXNS('meta'))
            meta.set('name', st)
            meta.set('content', '0')
            toc_file[0].append(meta)
        # fill in dtb:uid and dtb:depth, the other two stay at '0'
        toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
        toc_file[0][1].set('content', str(toc.depth()))
        set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
        epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
        epub_zip.close()
    finally:
        # ZipFile.close() leaves a file object it was given open, so the
        # file opened here has to be closed here
        if owns_output:
            output_file.close()