epub fix
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.  
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import subprocess
11 from copy import deepcopy
12 from lxml import etree
13 import zipfile
14 from tempfile import mkdtemp
15 from shutil import rmtree
16
17 import sys
18
19 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
20 from librarian.dcparser import BookInfo
21
22 from librarian import functions
23
# Import-time side effect: run librarian's `reg_person_name` hook once.
# NOTE(review): presumably this registers a person-name helper used by the
# XSLT stylesheets applied below -- confirm in `librarian.functions`.
functions.reg_person_name()
25
26
def inner_xml(node):
    """ serializes everything inside a node: its leading text plus all children

    The node's own tag is not part of the result.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    # leading text first (may be missing), then each child as markup
    pieces = [node.text or '']
    pieces.extend(etree.tostring(child) for child in node)
    return ''.join(pieces)
36
def set_inner_xml(node, text):
    """ replaces node's text and children with content parsed from a string

    `text` must be a well-formed XML fragment; it is parsed inside
    a temporary wrapper element.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    wrapper = etree.fromstring('<x>%s</x>' % text)
    new_children = list(wrapper)
    node.text = wrapper.text
    # slice assignment drops the old children and adopts the parsed ones
    node[:] = new_children
49
50
def node_name(node):
    """ extracts a plain-text name from a node

    Works on a copy, so the node itself is left untouched.  Contents of
    annotation and theme elements (pe, pa, pt, pr, motyw) are dropped,
    but the text following them is kept.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    scratch = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for unwanted in scratch.findall('.//' + tag):
            # clear() also wipes the tail, so put it back afterwards
            following_text = unwanted.tail
            unwanted.clear()
            unwanted.tail = following_text

    # flatten all remaining markup into the root's text
    etree.strip_tags(scratch, '*')
    return scratch.text
67
68
def xslt(xml, sheet):
    """ applies the XSLT stylesheet stored in file `sheet` to `xml`

    xml: an ElementTree, or a bare Element (which gets wrapped in a tree)
    sheet: path to the stylesheet file
    returns the result of the transformation
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as sheet_file:
        stylesheet = etree.parse(sheet_file)
        return document.xslt(stylesheet)
74
75
# Resource files live in the `epub` directory next to this module.
_module_dir = os.path.dirname(os.path.realpath(__file__))
_resdir = os.path.join(_module_dir, 'epub')


def res(fname):
    """ returns the path of resource file `fname` inside the resource directory """
    return os.path.join(_resdir, fname)
79
80
def replace_characters(node):
    """ replaces ASCII punctuation with typographic characters, in place

    Applied recursively to the text and tail of `node` and of all its
    descendants:

        ---  ->  em dash (U+2014)
        --   ->  en dash (U+2013)
        ,,   ->  low double quotation mark (U+201E)
        "    ->  right double quotation mark (U+201D)
        '    ->  right single quotation mark (U+2019)

    An `extra` element is emptied (text, children and attributes removed),
    but the text following it is kept and converted like any other text.
    """

    def replace_chars(text):
        if text is None:
            return None
        # order matters: '---' has to be handled before '--'
        return text.replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    if node.tag == 'extra':
        # clear() resets the tail as well; the tail is body text that
        # merely follows the note, so it must survive
        tail = node.tail
        node.clear()
        node.tail = tail
    else:
        node.text = replace_chars(node.text)
        for child in node:
            replace_characters(child)
    node.tail = replace_chars(node.tail)
97
98
def find_annotations(annotations, source, part_no):
    """ collects annotations from `source` and replaces them with their numbers

    annotations: element receiving copies of the annotations found; each
        copy gets `number` (1-based position in `annotations`) and `part`
        attributes
    source: element whose descendants are searched; modified in place
    part_no: number of the part the annotations come from

    Every pe/pa/pt/pr element found is emptied and left with just its
    number as text.  Contents of `extra` and `podtytul` are not searched.
    """
    annotation_tags = ('pe', 'pa', 'pt', 'pr')
    not_searched = ('extra', 'podtytul')

    for node in source:
        if node.tag in annotation_tags:
            number = str(len(annotations) + 1)

            note = deepcopy(node)
            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)

            # leave only the reference number in the text flow;
            # clear() drops the tail too, so it is restored by hand
            following_text = node.tail
            node.clear()
            node.text = number
            node.tail = following_text

        if node.tag in not_searched:
            continue
        find_annotations(annotations, node, part_no)
114
115
def replace_by_verse(tree):
    """ splits stanzas into verses at every '/' that ends a line

    Each piece not already starting with a verse element is wrapped in
    <wers_normalny>.  The same is done inside `slowo_obce` and
    `wyroznienie` elements directly under a stanza.  At stanza level,
    pieces starting with <extra are also left unwrapped.
    """

    def wrap_verses(pieces, kept_prefixes):
        # pieces starting with one of kept_prefixes go through unchanged
        return ''.join(
            piece if piece.startswith(kept_prefixes)
            else '<wers_normalny>' + piece + '</wers_normalny>'
            for piece in pieces)

    for stanza in tree.findall('.//' + WLNS('strofa')):
        for inline in stanza:
            if inline.tag not in ('slowo_obce', 'wyroznienie'):
                continue
            pieces = inner_xml(inline).split('/\n')
            if len(pieces) > 1:
                set_inner_xml(inline, wrap_verses(pieces, ('<wers',)))

        pieces = inner_xml(stanza).split('/\n')
        if len(pieces) > 1:
            set_inner_xml(stanza, wrap_verses(pieces, ('<wers', '<extra')))
141
142
def add_to_manifest(manifest, partno):
    """ Appends an <item> for part number `partno` to the manifest element
    of the content.opf file """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
153
154
def add_to_spine(spine, partno):
    """ Appends an <itemref> for part number `partno` to the spine element
    of the content.opf file """

    attributes = {'idref': 'part%d' % partno}
    spine.append(spine.makeelement(OPFNS('itemref'), attrib=attributes))
160
161
class TOC(object):
    """ A node of the table of contents tree

    name: label of the entry
    part_number: number of the partN.html file the entry points to
    sub_number: number of the #subN anchor inside that file, or None when
        the entry points at the file itself
    children: nested entries, in order
    """

    def __init__(self, name=None, part_number=None):
        self.name = name
        self.part_number = part_number
        self.sub_number = None
        self.children = []

    def add(self, name, part_number, level=0, is_part=True):
        """ Adds an entry `level` levels below this node.

        The entry is put under the most recently added child at each level;
        if a level has no children yet, it is attached right there.
        Returns the assigned sub_number if not is_part, None otherwise.
        """
        if level <= 0 or not self.children:
            entry = TOC(name, part_number)
            self.children.append(entry)
            if is_part:
                return None
            # counted after the append, so numbering starts at 2
            entry.sub_number = len(self.children) + 1
            return entry.sub_number
        return self.children[-1].add(name, part_number, level - 1, is_part)

    def append(self, toc):
        """ Attaches another TOC node as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Attaches all children of another TOC node as own children. """
        self.children.extend(toc.children)

    def depth(self):
        """ Returns the number of levels below this node (0 for a leaf). """
        if not self.children:
            return 0
        return 1 + max(child.depth() for child in self.children)

    def write_to_xml(self, nav_map, counter):
        """ Writes children, recursively, as NCX navPoint elements.

        nav_map: element the navPoints are appended to
        counter: id / playOrder number for the first navPoint written
        Returns the first number not used.
        """
        make = nav_map.makeelement
        for entry in self.children:
            label_text = make(NCXNS('text'))
            label_text.text = entry.name
            label = make(NCXNS('navLabel'))
            label.append(label_text)

            target = 'part%d.html' % entry.part_number
            if entry.sub_number is not None:
                target += '#sub%d' % entry.sub_number
            link = make(NCXNS('content'))
            link.set('src', target)

            point = make(NCXNS('navPoint'))
            point.set('id', 'NavPoint-%d' % counter)
            point.set('playOrder', str(counter))
            point.append(label)
            point.append(link)
            nav_map.append(point)

            # nested entries take the numbers right after their parent
            counter = entry.write_to_xml(point, counter + 1)
        return counter
213
214
def used_chars(element):
    """ Returns the set of characters occurring in the text and tail of
    an ETree Element and of all its descendants """
    found = set()
    pending = [element]
    while pending:
        node = pending.pop()
        found.update(node.text or '')
        found.update(node.tail or '')
        # queue the children
        pending.extend(node)
    return found
221
222
def chop(main_text):
    """ divides main content of the XML file into chunks

    Yields <utwor><master>...</master></utwor> elements.  A new chunk
    starts at every `naglowek_czesc`, and at every chapter-level header
    (naglowek_rozdzial, naglowek_akt, srodtytul) unless that header comes
    right after a `naglowek_czesc`.

    The very same element is yielded each time and refilled afterwards,
    so every chunk has to be consumed before asking for the next one.
    """

    chapter_tags = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    # one container, reused for all chunks
    part_xml = etree.Element('utwor')
    master = etree.SubElement(part_xml, 'master')

    directly_after_part_header = False
    for node in main_text:
        tag = node.tag
        starts_part = tag == 'naglowek_czesc'
        starts_chapter = not directly_after_part_header and tag in chapter_tags

        if starts_part or starts_chapter:
            # hand out what was collected so far, start over with the header
            yield part_xml
            master[:] = [deepcopy(node)]
            if starts_part:
                directly_after_part_header = True
        else:
            master.append(deepcopy(node))
            directly_after_part_header = False
    yield part_xml
245
246
def transform_chunk(chunk_xml, chunk_no, annotations):
    """ transforms one chunk

    chunk_xml: chunk element, with the content in its first child
    chunk_no: number of the chunk, used in TOC entries and annotations
    annotations: element collecting annotations found in the chunk
    returns a HTML string, a TOC object and a set of used characters
    """

    part_headers = ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul")
    sub_headers = ('naglowek_podrozdzial', 'naglowek_scena')

    toc = TOC()
    for element in chunk_xml[0]:
        tag = element.tag
        if tag in part_headers:
            toc.add(node_name(element), chunk_no)
        elif tag in sub_headers:
            # lower-level headers get a numbered `sub` attribute,
            # matching the number stored in their TOC entry
            sub = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            element.set('sub', str(sub))

    find_annotations(annotations, chunk_xml, chunk_no)
    replace_by_verse(chunk_xml)

    html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
    chars = used_chars(html_tree.getroot())
    return etree.tostring(html_tree, pretty_print=True), toc, chars
263
264
def transform(provider, slug, output_file=None, output_dir=None, make_dir=False):
    """ produces a EPUB file

    provider: a DocProvider
    slug: slug of file to process, available by provider
    output_file: file-like object or path to output file
    output_dir: path to directory to save output file to; either this or output_file must be present
    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub

    raises NoDublinCore if the document has no RDF Description element,
    ValueError if neither output_file nor output_dir is given
    """

    def transform_file(input_xml, chunk_counter=1, first=True):
        """ processes one input file and proceeds to its children

        Writes HTML parts to the archive and registers them in the manifest
        and the spine.  Returns a TOC object, the next free chunk number
        and a set of used characters.
        """

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(input_xml, res('xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, pretty_print=True))
        elif children:
            # write title page for every parent
            html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/part%d.html' % chunk_counter,
                etree.tostring(html_tree, pretty_print=True))
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        root = input_xml.getroot()
        if len(root) > 1:
            # rdf before style master
            main_text = root[1]
        else:
            # rdf in style master
            main_text = root[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            replace_characters(main_text)

            for chunk_xml in chop(main_text):
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in children:
            child_xml = etree.parse(provider.by_uri(child))
            child_toc, chunk_counter, chunk_chars = transform_file(child_xml, chunk_counter, first=False)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars

    # read metadata from the first file
    input_xml = etree.parse(provider[slug])
    metadata = input_xml.find('.//'+RDFNS('Description'))
    if metadata is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    opened_here = False
    if output_dir is not None:
        if make_dir:
            author = unicode(book_info.author)
            output_dir = os.path.join(output_dir, author)
            try:
                os.makedirs(output_dir)
            except OSError:
                # an already existing directory is fine, anything else is not
                if not os.path.isdir(output_dir):
                    raise
        # ZIP data is binary, so the file must not be opened in text mode
        output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'wb')
        opened_here = True
    elif output_file is None:
        raise ValueError('Either output_file or output_dir must be given.')

    try:
        epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

        # write static elements
        mime = zipfile.ZipInfo()
        mime.filename = 'mimetype'
        mime.compress_type = zipfile.ZIP_STORED
        mime.extra = ''
        epub_zip.writestr(mime, 'application/epub+zip')
        epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" '
                           'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
                           '<rootfiles><rootfile full-path="OPS/content.opf" '
                           'media-type="application/oebps-package+xml" />'
                           '</rootfiles></container>')
        for fname in 'style.css', 'logo_wolnelektury.png':
            epub_zip.write(res(fname), os.path.join('OPS', fname))

        opf = xslt(metadata, res('xsltContent.xsl'))
        manifest = opf.find('.//' + OPFNS('manifest'))
        spine = opf.find('.//' + OPFNS('spine'))

        annotations = etree.Element('annotations')

        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
                                   '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
                                   '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
                                   'version="2005-1"><head></head><docTitle></docTitle><navMap>'
                                   '<navPoint id="NavPoint-1" playOrder="1"><navLabel>'
                                   '<text>Strona tytułowa</text></navLabel><content src="title.html" />'
                                   '</navPoint></navMap></ncx>')
        nav_map = toc_file[-1]

        toc, chunk_counter, chars = transform_file(input_xml)

        if not toc.children:
            toc.add(u"Początek utworu", 1)
        # number 1 is taken by the title page navPoint above
        toc_counter = toc.write_to_xml(nav_map, 2)

        # Last modifications in container files and EPUB creation
        if len(annotations) > 0:
            nav_map.append(etree.fromstring(
                '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'
                '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
            manifest.append(etree.fromstring(
                '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
            spine.append(etree.fromstring(
                '<itemref idref="annotations" />'))
            replace_by_verse(annotations)
            html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
            chars = chars.union(used_chars(html_tree.getroot()))
            epub_zip.writestr('OPS/annotations.html', etree.tostring(
                                html_tree, pretty_print=True))

        # strip fonts
        tmpdir = mkdtemp('-librarian-epub')
        try:
            # subset.pl is run from its own directory; passing cwd to the
            # child avoids changing the working directory of this process
            optimizer_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer')
            chars_arg = ''.join(chars).encode('utf-8')
            for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
                stripped_font = os.path.join(tmpdir, fname)
                subprocess.check_call(['perl', 'subset.pl', '--chars', chars_arg,
                                       res('../fonts/' + fname), stripped_font],
                                      cwd=optimizer_dir)
                epub_zip.write(stripped_font, os.path.join('OPS', fname))
        finally:
            # remove temporary fonts even if subsetting failed
            rmtree(tmpdir)

        epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
        title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
        attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
        for st in attributes:
            meta = toc_file.makeelement(NCXNS('meta'))
            meta.set('name', st)
            meta.set('content', '0')
            toc_file[0].append(meta)
        toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
        toc_file[0][1].set('content', str(toc.depth()))
        # the title is plain text: set it as element text instead of splicing
        # it into markup, so characters like '&' or '<' cannot break the XML
        doc_title_text = toc_file.makeelement(NCXNS('text'))
        doc_title_text.text = title
        toc_file[1].text = None
        toc_file[1][:] = [doc_title_text]
        epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
        epub_zip.close()
    finally:
        # close only what was opened here; a caller's file object stays open
        if opened_here:
            output_file.close()