Allow using remote cache for image downloading. Also, DRY in book2* scripts
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import subprocess
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
14 import zipfile
15 from tempfile import mkdtemp, NamedTemporaryFile
16 from shutil import rmtree
17
18 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
19 from librarian.cover import WLCover
20
21 from librarian import functions, get_resource
22
# Import-time side effect: functions.reg_person_name() is called once, when
# this module is first loaded.
# NOTE(review): presumably this registers the person-name helper used by the
# epub XSLT sheets -- confirm against librarian.functions.
functions.reg_person_name()
24
25
def inner_xml(node):
    """ Serialize the content of a node, leaving out the node's own tags.

    The result is the node's leading text (empty string if there is none)
    followed by every child element rendered with etree.tostring().

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    if node.text is None:
        pieces = ['']
    else:
        pieces = [node.text]
    pieces.extend(etree.tostring(child) for child in node)
    return ''.join(pieces)
35
def set_inner_xml(node, text):
    """ Replace a node's text and children with content parsed from a string

    `text` is wrapped in a throw-away <x> element and parsed; the parsed
    element's children and leading text are then moved onto `node`.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    parsed = etree.fromstring('<x>%s</x>' % text)
    node[:] = parsed[:]
    node.text = parsed.text
48
49
def node_name(node):
    """ Return the plain-text name of a node

    Works on a deep copy, so `node` itself is not modified.  Every
    pe/pa/pt/pr/motyw descendant is emptied (its tail text is kept), then
    all remaining tags are stripped and the resulting text is returned.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    scratch = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for hit in scratch.findall('.//' + tag):
            # clear() also drops the tail, so put it back afterwards
            kept_tail = hit.tail
            hit.clear()
            hit.tail = kept_tail
    etree.strip_tags(scratch, '*')
    return scratch.text
66
67
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet stored in file `sheet` to `xml`

    `xml` may be an element (it is wrapped in an ElementTree first) or any
    object that already offers an .xslt() method.  Returns whatever that
    .xslt() call returns.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
73
74
def replace_characters(node):
    """ Recursively apply typographic character replacements to a tree

    The text and tail of `node` and of all its descendants are rewritten in
    place.  Nodes tagged 'uwaga' or 'extra' are emptied first (only their
    tail survives).
    """
    # Applied in this exact order: "---" has to be handled before "--".
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    if node.tag in ('uwaga', 'extra'):
        # clear() also drops the tail, so put it back afterwards
        saved_tail = node.tail
        node.clear()
        node.tail = saved_tail
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
93
94
def find_annotations(annotations, source, part_no):
    """ Move annotations out of `source` and collect them in `annotations`

    Walks `source` recursively, without descending into 'extra' or 'uwaga'
    nodes.  Each pe/pa/pt/pr node is deep-copied into `annotations` with
    'number' (1-based position in `annotations`) and 'part' (`part_no`)
    attributes; the node left in `source` is emptied and its text is set to
    that number, keeping its original tail.
    """
    for child in source:
        if child.tag in ('extra', 'uwaga'):
            continue
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            number = str(len(annotations) + 1)
            collected = deepcopy(child)
            collected.set('number', number)
            collected.set('part', str(part_no))
            collected.tail = ''
            annotations.append(collected)
            # leave only a numbered marker behind, with the original tail
            kept_tail = child.tail
            child.clear()
            child.tail = kept_tail
            child.text = number
        find_annotations(annotations, child, part_no)
110
111
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    For every stanza found under `tree`, the content of its 'slowo_obce' and
    'wyroznienie' children and then of the stanza itself is split on '/'
    followed by a newline.  Pieces that do not already start with a verse
    tag (or, at stanza level, with '<extra') get wrapped in <wers_normalny>.
    Content without such a separator is left untouched.
    """

    def wrap_verses(element, passthrough):
        # `passthrough`: tuple of prefixes marking pieces that are kept as-is
        pieces = inner_xml(element).split('/\n')
        if len(pieces) > 1:
            rebuilt = ''.join(
                piece if piece.startswith(passthrough)
                else '<wers_normalny>' + piece + '</wers_normalny>'
                for piece in pieces)
            set_inner_xml(element, rebuilt)

    for stanza in tree.findall('.//' + WLNS('strofa')):
        for inline in stanza:
            if inline.tag in ('slowo_obce', 'wyroznienie'):
                wrap_verses(inline, ('<wers',))
        wrap_verses(stanza, ('<wers', '<extra'))
137
138
def add_to_manifest(manifest, partno):
    """ Append an <item> for part number `partno` to the OPF manifest node

    The item gets id 'part<N>', href 'part<N>.html' and the XHTML media type.
    """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
148     manifest.append(e)
149
150
def add_to_spine(spine, partno):
    """ Append an <itemref> pointing at 'part<N>' to the OPF spine node """

    attributes = {'idref': 'part%d' % partno}
    spine.append(spine.makeelement(OPFNS('itemref'), attrib=attributes))
156
157
class TOC(object):
    """ A node of the table-of-contents tree

    Every node has a display name, the href of the file it points to, an
    optional anchor number (`sub_number`) and an ordered list of children.
    """

    def __init__(self, name=None, part_href=None):
        self.name = name
        self.part_href = part_href
        self.sub_number = None
        self.children = []

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Add a new entry `level` steps below this node

        While `level` is positive and there are children, the call is
        delegated to the last child.  Otherwise the entry becomes a direct
        child, inserted at `index` or appended when `index` is None.
        `index` may only be given together with level 0.

        Returns the new entry's sub_number when `is_part` is false,
        None otherwise.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)

        entry = TOC(name, part_href)
        if index is None:
            self.children.append(entry)
        else:
            self.children.insert(index, entry)
        if is_part:
            return None
        entry.sub_number = len(self.children) + 1
        return entry.sub_number

    def append(self, toc):
        """ Attach `toc` itself as the last child """
        self.children.append(toc)

    def extend(self, toc):
        """ Attach all children of `toc` as children of this node """
        self.children.extend(toc.children)

    def depth(self):
        """ Number of levels below this node (0 for a leaf) """
        if not self.children:
            return 0
        return 1 + max(child.depth() for child in self.children)

    def href(self):
        """ Link target: the part file, plus '#sub<N>' if sub_number is set """
        if self.sub_number is None:
            return self.part_href
        return self.part_href + '#sub%d' % self.sub_number

    def write_to_xml(self, nav_map, counter=1):
        """ Append one NCX navPoint per descendant under `nav_map`

        Points are numbered depth-first starting from `counter`; the first
        unused number is returned.
        """
        make = nav_map.makeelement
        for child in self.children:
            point = make(NCXNS('navPoint'))
            point.set('id', 'NavPoint-%d' % counter)
            point.set('playOrder', str(counter))

            label_text = make(NCXNS('text'))
            label_text.text = child.name
            label = make(NCXNS('navLabel'))
            label.append(label_text)
            point.append(label)

            target = make(NCXNS('content'))
            target.set('src', child.href())
            point.append(target)

            nav_map.append(point)
            counter = child.write_to_xml(point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Render the descendants as indented <div> links, one per line """
        template = "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>"
        lines = []
        for child in self.children:
            lines.append(template % (depth, child.href(), child.name))
            lines.append(child.html_part(depth+1))
        return "\n".join(lines)

    def html(self):
        """ Fill the epub/toc.html resource template with html_part() """
        with open(get_resource('epub/toc.html')) as template_file:
            template = unicode(template_file.read(), 'utf-8')
        return template % self.html_part()
230
231
def used_chars(element):
    """ Return the set of characters used in an ETree element

    Covers the text and tail of `element` itself and of all its descendants.
    """
    own_text = (element.text or '') + (element.tail or '')
    return set(own_text).union(*[used_chars(child) for child in element])
238
239
def chop(main_text):
    """ Divide the main content of the XML file into chunks

    Generator.  A new chunk starts at every 'naglowek_czesc' and at every
    'naglowek_rozdzial' / 'naglowek_akt' / 'srodtytul' that does not directly
    follow a 'naglowek_czesc'.  Children of `main_text` are deep-copied.

    The very same <utwor><master/></utwor> container object is yielded each
    time and refilled in between, so a chunk has to be consumed before the
    generator is advanced.
    """

    # one reusable container: <utwor><master>...</master></utwor>
    container = etree.Element('utwor')
    master = etree.SubElement(container, 'master')

    chapter_headers = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")
    after_part_header = False
    for item in main_text:
        tag = item.tag
        is_part_header = tag == 'naglowek_czesc'
        if is_part_header or (not after_part_header and tag in chapter_headers):
            yield container
            master[:] = [deepcopy(item)]
            if is_part_header:
                after_part_header = True
        else:
            master.append(deepcopy(item))
            after_part_header = False
    yield container
262
263
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk container; headers are looked for among the children
        of its first child
    chunk_no: number of the chunk, used to build the "part<N>.html" href
    annotations: element that collects annotations found in the chunk
    empty: if true, the chunk body is not transformed and the content of
        the epub/emptyChunk.html resource is returned instead
    _empty_html_static: internal cache of that resource; the mutable default
        is shared between calls on purpose -- do not pass it
    """

    part_href = "part%d.html" % chunk_no
    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), part_href)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), part_href, level=1, is_part=False)
            # remember the anchor number on the element itself
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # read the placeholder once and close the file right away
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
286
287
def transform(wldoc, verbose=False,
              style=None, html_toc=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    wldoc: source document; it is deep-copied, the original is not modified
    verbose: print a progress message and let the font optimizer write to
        the console instead of discarding its output
    style: path to a CSS file; defaults to the epub/style.css resource
    html_toc: also generate an HTML table of contents (toc.html)
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy

    Returns OutputFile.from_filename() for the temporary .epub file written.
    """

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns (toc, next chunk number, used characters, remaining sample).
        """

        replace_characters(wldoc.edoc.getroot())

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                with open(get_resource('epub/emptyChunk.html')) as empty_file:
                    html_string = empty_file.read()
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            epub_zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample


    document = deepcopy(wldoc)
    del wldoc

    if flags:
        # flags are exposed to the stylesheets as attributes of the root node
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    epub_zip.writestr(mime, 'application/epub+zip')
    epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                       'media-type="application/oebps-package+xml" />' \
                       '</rootfiles></container>')
    epub_zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
    epub_zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    epub_zip.write(style, os.path.join('OPS', 'style.css'))

    if cover:
        if cover is True:
            cover = WLCover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        epub_zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        epub_zip.writestr('OPS/cover.html', etree.tostring(
                        cover_tree, method="html", pretty_print=True))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))


    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                               '</navMap></ncx>')
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        epub_zip.writestr('OPS/annotations.html', etree.tostring(
                            html_tree, method="html", pretty_print=True))

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    epub_zip.writestr('OPS/last.html', etree.tostring(
                        html_tree, method="html", pretty_print=True))

    if not flags or 'without-fonts' not in flags:
        # strip fonts
        tmpdir = mkdtemp('-librarian-epub')
        cwd = os.getcwd()
        try:
            # subset.pl is run from inside the font-optimizer directory
            os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
            # the character list is the same for every font, build it once
            chars_arg = ''.join(chars).encode('utf-8')
            for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
                optimizer_call = ['perl', 'subset.pl', '--chars', chars_arg,
                                  get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
                if verbose:
                    print("Running font-optimizer")
                    subprocess.check_call(optimizer_call)
                else:
                    # Discard the output through os.devnull: nobody reads a
                    # PIPE here, so a chatty child could block forever on it.
                    with open(os.devnull, 'w') as devnull:
                        subprocess.check_call(optimizer_call, stdout=devnull, stderr=devnull)
                epub_zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
                manifest.append(etree.fromstring(
                    '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
        finally:
            # restore the caller's working directory and clean up even when
            # the optimizer fails
            os.chdir(cwd)
            rmtree(tmpdir)

    epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        epub_zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    epub_zip.close()
    # ZipFile.close() leaves a file object it was handed open; the file
    # stays on disk because it was created with delete=False
    output_file.close()

    return OutputFile.from_filename(output_file.name)