fixes #938: dc:source in documents;
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.  
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import subprocess
11 from copy import deepcopy
12 from lxml import etree
13 import zipfile
14 from tempfile import mkdtemp
15 from shutil import rmtree
16
17 import sys
18
19 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
20 from librarian.dcparser import BookInfo
21
22 from librarian import functions
23
24 functions.reg_person_name()
25
26
def inner_xml(node):
    """ serializes the content of a node: its leading text followed by every child

    Each child is serialized with etree.tostring; a missing leading text
    counts as an empty string.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    pieces = [node.text or '']
    for child in node:
        pieces.append(etree.tostring(child))
    return ''.join(pieces)
36
def set_inner_xml(node, text):
    """ replaces node's text and children with the parsed content of a string

    The string is parsed inside a temporary <x> wrapper element, so it has
    to be a well-formed XML fragment.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    wrapper = etree.fromstring('<x>%s</x>' % text)
    node.text = wrapper.text
    node[:] = list(wrapper)
49
50
def node_name(node):
    """ Extracts a node's name: its text content without any markup

    The node itself is left untouched; all the work is done on a copy.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    scratch = deepcopy(node)

    # empty the pe/pa/pt/pr/motyw elements, but keep the text following them
    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for dropped in scratch.findall('.//' + tag):
            kept_tail = dropped.tail
            dropped.clear()
            dropped.tail = kept_tail

    # remove all remaining tags, leaving only the text
    etree.strip_tags(scratch, '*')
    return scratch.text
67
68
def xslt(xml, sheet):
    """ applies the XSLT stylesheet read from the file `sheet` to `xml`

    xml: an element or an element tree; a bare element is wrapped
        in an ElementTree first
    sheet: path to the stylesheet file
    """
    tree = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as sheet_file:
        stylesheet = etree.parse(sheet_file)
        return tree.xslt(stylesheet)
74
75
# resources live in the 'epub' directory located next to this module
_resdir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    'epub',
)


def res(fname):
    """ returns the path to the resource file `fname` """
    return os.path.join(_resdir, fname)
79
80
def replace_characters(node):
    """ replaces ASCII punctuation with typographic characters, in place

    Works recursively on the text and tail of the node and its descendants.
    An 'extra' element is not converted but cleared instead.
    """
    # order matters: '---' has to be handled before '--'
    substitutions = (
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for plain, typographic in substitutions:
            text = text.replace(plain, typographic)
        return text

    if node.tag == 'extra':
        node.clear()
        return

    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
95         for child in node:
96             replace_characters(child)
97
98
def find_annotations(annotations, source, part_no):
    """ collects annotations (pe, pa, pt, pr) found below `source`

    A numbered copy of every annotation is appended to `annotations`,
    with 'number' and 'part' attributes set; the original element is
    emptied and gets the number as its only text. The content of 'extra'
    and 'podtytul' elements is not searched.
    """
    for element in source:
        if element.tag in ('pe', 'pa', 'pt', 'pr'):
            number = str(len(annotations) + 1)

            note = deepcopy(element)
            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)

            # leave just the number in the text, keeping what follows it
            following = element.tail
            element.clear()
            element.tail = following
            element.text = number

        if element.tag in ('extra', 'podtytul'):
            continue
        find_annotations(annotations, element, part_no)
114
115
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    The content of a stanza (and of 'slowo_obce' / 'wyroznienie' elements
    directly inside it) is split on '/' followed by a newline. If there is
    more than one piece, every piece not starting with a verse tag already
    is wrapped in <wers_normalny>.
    """

    def wrap_verses(pieces, passthrough):
        # pieces starting with one of the `passthrough` prefixes are kept as they are
        wrapped = []
        for piece in pieces:
            if piece.startswith(passthrough):
                wrapped.append(piece)
            else:
                wrapped.append('<wers_normalny>' + piece + '</wers_normalny>')
        return ''.join(wrapped)

    for stanza in tree.findall('.//' + WLNS('strofa')):
        # inline elements first, so the stanza itself sees their final content
        for inline in stanza:
            if inline.tag not in ('slowo_obce', 'wyroznienie'):
                continue
            pieces = inner_xml(inline).split('/\n')
            if len(pieces) > 1:
                set_inner_xml(inline, wrap_verses(pieces, ('<wers',)))

        pieces = inner_xml(stanza).split('/\n')
        if len(pieces) > 1:
            set_inner_xml(stanza, wrap_verses(pieces, ('<wers', '<extra')))
141
142
def add_to_manifest(manifest, partno):
    """ Adds an item for chunk number `partno` to the manifest section in content.opf file """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
153
154
def add_to_spine(spine, partno):
    """ Adds an itemref for chunk number `partno` to the spine section in content.opf file """

    part_id = 'part%d' % partno
    itemref = spine.makeelement(OPFNS('itemref'), attrib={'idref': part_id})
    spine.append(itemref)
160
161
class TOC(object):
    """ A node of the table of contents tree

    name: label of the entry
    part_number: number of the chunk (partN.html) the entry points to
    sub_number: number used in the '#subN' fragment of the link;
        None if the entry points to the whole chunk
    children: list of nested TOC entries
    """

    def __init__(self, name=None, part_number=None):
        self.children = []
        self.name = name
        self.part_number = part_number
        self.sub_number = None

    def add(self, name, part_number, level=0, is_part=True):
        """ adds a new entry, `level` levels below this one

        On each level the entry goes under the last existing child; when
        there are no children to descend into, it is added right here.
        Returns the sub_number given to the entry, or None if is_part is true.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level-1, is_part)

        entry = TOC(name, part_number)
        self.children.append(entry)
        if is_part:
            return None
        # computed after the append, so the first child gets number 2
        entry.sub_number = len(self.children) + 1
        return entry.sub_number

    def append(self, toc):
        """ adds an existing TOC object as the last child """
        self.children.append(toc)

    def extend(self, toc):
        """ adds all children of another TOC object as own children """
        self.children.extend(toc.children)

    def depth(self):
        """ returns the number of levels below this entry """
        if not self.children:
            return 0
        return 1 + max(child.depth() for child in self.children)

    def write_to_xml(self, nav_map, counter):
        """ writes the children as nested navPoint elements appended to nav_map

        counter: number to give to the first navPoint written
        Returns the first unused number.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            label_text = nav_map.makeelement(NCXNS('text'))
            label_text.text = child.name
            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            nav_label.append(label_text)
            nav_point.append(nav_label)

            target = 'part%d.html' % child.part_number
            if child.sub_number is not None:
                target = '%s#sub%d' % (target, child.sub_number)
            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', target)
            nav_point.append(content)

            nav_map.append(nav_point)
            # nested entries are numbered right after their parent
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter
213
214
def used_chars(element):
    """ Returns a set of all characters used in the text and tail of an ETree Element and its descendants """
    own_text = (element.text or '') + (element.tail or '')
    found = set(own_text)
    for child in element:
        found.update(used_chars(child))
    return found
221
222
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator. A new chunk starts at every 'naglowek_czesc' and at every
    'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul' which does not
    directly follow a 'naglowek_czesc'.

    The same <utwor><master/></utwor> container object is yielded each
    time and refilled afterwards, so it has to be used before the
    generator is advanced.
    """

    # one container, reused for every chunk
    container = etree.Element('utwor')
    master = etree.SubElement(container, 'master')

    section_headers = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")
    after_part_header = False

    for node in main_text:
        starts_part = node.tag == 'naglowek_czesc'
        starts_section = not after_part_header and node.tag in section_headers
        if starts_part or starts_section:
            # hand over what was gathered so far, then start a new chunk
            yield container
            master[:] = [deepcopy(node)]
            if starts_part:
                after_part_header = True
        else:
            master.append(deepcopy(node))
            after_part_header = False
    yield container
245
246
def transform_chunk(chunk_xml, chunk_no, annotations):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: the chunk; headers are looked for among the children
        of its first child
    chunk_no: number of the chunk, used in TOC entries and annotations
    annotations: element which collects annotations found in the chunk

    The chunk is modified in place: sub-headers get a 'sub' attribute,
    annotations are replaced with their numbers and stanzas are divided
    into verses.
    """
    part_headers = ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul")
    sub_headers = ('naglowek_podrozdzial', 'naglowek_scena')

    toc = TOC()
    for element in chunk_xml[0]:
        tag = element.tag
        if tag in part_headers:
            toc.add(node_name(element), chunk_no)
        elif tag in sub_headers:
            # the number returned by the TOC is stored on the header itself
            sub_number = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            element.set('sub', str(sub_number))

    find_annotations(annotations, chunk_xml, chunk_no)
    replace_by_verse(chunk_xml)

    html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
    chars = used_chars(html_tree.getroot())
    return etree.tostring(html_tree, pretty_print=True), toc, chars
263
264
def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False):
    """ produces a EPUB file

    provider: a DocProvider
    slug: slug of file to process, available by provider
    file_path: path to the file to process; either this or slug must be present, not both
    output_file: file-like object or path to output file
    output_dir: path to directory to save output file to; either this or output_file must be present
    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
    verbose: if true, announces font optimizer runs and doesn't silence their output

    Raises ValueError if the input or the output is not specified properly,
    NoDublinCore if the document has no RDF description, and
    subprocess.CalledProcessError if the font optimizer fails.
    """

    def transform_file(input_xml, chunk_counter=1, first=True):
        """ processes one input file and proceeds to its children

        Returns a tuple: TOC of the file, the first unused chunk number
        and a set of characters used in the generated HTML.
        """

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(input_xml, res('xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, pretty_print=True))
        elif children:
            # write title page for every parent
            html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/part%d.html' % chunk_counter,
                etree.tostring(html_tree, pretty_print=True))
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        else:
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            replace_characters(main_text)

            for chunk_xml in chop(main_text):
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in children:
            child_xml = etree.parse(provider.by_uri(child))
            child_toc, chunk_counter, chunk_chars = transform_file(child_xml, chunk_counter, first=False)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars

    def write_stripped_fonts(chars):
        """ writes the fonts, reduced to the characters from `chars`, to the EPUB """

        # the character list is the same for every font, so build it only once
        char_list = ''.join(chars).encode('utf-8')
        tmpdir = mkdtemp('-librarian-epub')
        cwd = os.getcwd()
        try:
            # subset.pl is called by a relative name, from its own directory
            os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
            for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
                optimizer_call = ['perl', 'subset.pl', '--chars', char_list, res('../fonts/' + fname), os.path.join(tmpdir, fname)]
                if verbose:
                    print("Running font-optimizer")
                    subprocess.check_call(optimizer_call)
                else:
                    # output goes to the null device; unread pipes would block
                    # the optimizer as soon as it fills the pipe buffer
                    with open(os.devnull, 'w') as devnull:
                        subprocess.check_call(optimizer_call, stdout=devnull, stderr=devnull)
                epub_zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
        finally:
            # restore the working directory and clean up even if the optimizer failed
            os.chdir(cwd)
            rmtree(tmpdir)

    # read metadata from the first file
    if file_path:
        if slug:
            raise ValueError('slug or file_path should be specified, not both')
        with open(file_path, 'r') as f:
            input_xml = etree.parse(f)
    else:
        if not slug:
            raise ValueError('either slug or file_path should be specified')
        input_xml = etree.parse(provider[slug])

    metadata = input_xml.find('.//'+RDFNS('Description'))
    if metadata is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    opened_output = False
    if output_dir is not None:
        if make_dir:
            author = unicode(book_info.author)
            output_dir = os.path.join(output_dir, author)
            try:
                os.makedirs(output_dir)
            except OSError:
                # an already existing directory is fine, anything else is a real error
                if not os.path.isdir(output_dir):
                    raise
        if slug:
            output_name = '%s.epub' % slug
        else:
            output_name = os.path.splitext(os.path.basename(file_path))[0] + '.epub'
        # EPUB is a ZIP archive, so the file has to be opened in binary mode
        output_file = open(os.path.join(output_dir, output_name), 'wb')
        opened_output = True
    elif output_file is None:
        raise ValueError('either output_file or output_dir should be specified')

    try:
        epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

        # write static elements
        mime = zipfile.ZipInfo()
        mime.filename = 'mimetype'
        # the mimetype entry is stored uncompressed
        mime.compress_type = zipfile.ZIP_STORED
        mime.extra = ''
        epub_zip.writestr(mime, 'application/epub+zip')
        epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                           'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                           '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                           'media-type="application/oebps-package+xml" />' \
                           '</rootfiles></container>')
        for fname in 'style.css', 'logo_wolnelektury.png':
            epub_zip.write(res(fname), os.path.join('OPS', fname))

        opf = xslt(metadata, res('xsltContent.xsl'))
        manifest = opf.find('.//' + OPFNS('manifest'))
        spine = opf.find('.//' + OPFNS('spine'))

        annotations = etree.Element('annotations')

        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                                   '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                                   '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                                   'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                                   '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                                   '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                                   '</navPoint></navMap></ncx>')
        nav_map = toc_file[-1]

        toc, chunk_counter, chars = transform_file(input_xml)

        if not toc.children:
            toc.add(u"Początek utworu", 1)
        # number 1 is taken by the title page
        toc_counter = toc.write_to_xml(nav_map, 2)

        # Last modifications in container files and EPUB creation
        if len(annotations) > 0:
            nav_map.append(etree.fromstring(
                '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
                '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
            manifest.append(etree.fromstring(
                '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
            spine.append(etree.fromstring(
                '<itemref idref="annotations" />'))
            replace_by_verse(annotations)
            html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
            chars = chars.union(used_chars(html_tree.getroot()))
            epub_zip.writestr('OPS/annotations.html', etree.tostring(
                                html_tree, pretty_print=True))

        # strip fonts
        write_stripped_fonts(chars)

        epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
        title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
        attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
        for st in attributes:
            meta = toc_file.makeelement(NCXNS('meta'))
            meta.set('name', st)
            meta.set('content', '0')
            toc_file[0].append(meta)
        # the first two meta elements are dtb:uid and dtb:depth
        toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
        toc_file[0][1].set('content', str(toc.depth()))
        set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
        epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
        epub_zip.close()
    finally:
        # close only the file opened here; a file passed by the caller is left open
        if opened_output:
            output_file.close()