[txt] return Windows friendly txt file with \r\n line ends instead of \n
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import re
11 import subprocess
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
15 import zipfile
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
18
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian.cover import DefaultEbookCover
21
22 from librarian import functions, get_resource
23
# Executed once at import time, before any stylesheet is applied.
# NOTE(review): presumably these register the `person_name` and
# `lang_code_3to2` helpers as XSLT extension functions used by the
# epub/*.xsl sheets -- confirm against librarian.functions.
functions.reg_person_name()
functions.reg_lang_code_3to2()
26
27
def inner_xml(node):
    """ serializes everything inside a node (its text and child elements)

    The node's own start and end tags are not part of the result.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    # Leading text first (missing text counts as empty), then every child
    # as serialized by etree.tostring.
    pieces = [node.text or '']
    for child in node:
        pieces.append(etree.tostring(child))
    return ''.join(pieces)
37
def set_inner_xml(node, text):
    """ replaces a node's text and children with the parsed content of a string

    The string is parsed as the body of a throw-away wrapper element, so it
    has to be well-formed XML content.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    wrapper = etree.fromstring('<x>%s</x>' % text)
    new_children = wrapper[:]
    node.text = wrapper.text
    node[:] = new_children
50
51
def node_name(node):
    """ Returns the plain text of a node, to be used as its name

    Works on a copy, so the node itself is left untouched. The contents of
    pe, pa, pt, pr and motyw descendants are dropped (the text following
    them is kept); all remaining markup is flattened into text.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    scratch = deepcopy(node)
    skipped_tags = ('pe', 'pa', 'pt', 'pr', 'motyw')

    for tag in skipped_tags:
        for found in scratch.findall('.//' + tag):
            # clear() would also drop the tail, so carry it over by hand
            kept_tail = found.tail
            found.clear()
            found.tail = kept_tail
    etree.strip_tags(scratch, '*')
    return scratch.text
68
69
def xslt(xml, sheet):
    """ Applies the XSLT stylesheet stored in the file `sheet` to `xml`.

    `xml` may be an element or an element tree; a bare element is wrapped
    in an ElementTree first. Returns whatever the tree's xslt() call returns.
    """
    tree = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as sheet_file:
        stylesheet = etree.parse(sheet_file)
        return tree.xslt(stylesheet)
75
76
def replace_characters(node):
    """ Applies typographic replacements to all text in a tree, in place.

    Removes U+FEFF and turns '---', '--', ',,', '"' and "'" into their
    typographic counterparts in the text and tail of the node and of all
    its descendants. 'uwaga' and 'extra' elements are emptied first (their
    tail is kept).
    """
    # Order matters: '---' has to be handled before '--'.
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    if node.tag in ('uwaga', 'extra'):
        # clear() resets the tail as well, so put it back afterwards
        kept_tail = node.tail
        node.clear()
        node.tail = kept_tail
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for descendant in node:
        replace_characters(descendant)
95
96
def find_annotations(annotations, source, part_no):
    """ Collects annotations from a tree and replaces them with their numbers.

    Every pe/pa/pt/pr descendant of `source` is copied into `annotations`
    (with 'number' and 'part' attributes set and an empty tail); the
    original element is emptied and gets the number as its text.
    Contents of 'extra' and 'uwaga' elements are not searched.
    """
    annotation_tags = ('pe', 'pa', 'pt', 'pr')
    for child in source:
        tag = child.tag
        if tag in annotation_tags:
            number = str(len(annotations) + 1)
            note = deepcopy(child)
            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)
            # empty the original, keeping only what follows it
            kept_tail = child.tail
            child.clear()
            child.tail = kept_tail
            child.text = number
        if tag in ('extra', 'uwaga'):
            continue
        find_annotations(annotations, child, part_no)
112
113
class Stanza(object):
    """
    Splits a stanza into verse elements at '/' line endings.

    Only slashes found directly in the stanza's own text (or in the tails of
    its children) end a verse. Slashes inside subelements are left alone and
    the subelements are moved whole into the verse being built.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """
    def __init__(self, stanza_elem):
        # verses collected so far and the verse new content is appended to
        self.stanza = stanza_elem
        self.verses = []
        self.open_verse = None

    def versify(self):
        """ Rebuilds the stanza in place as a sequence of verse elements. """
        self.push_text(self.stanza.text)
        for child in self.stanza:
            self.push_elem(child)
            self.push_text(child.tail)
        # clear() drops the tail too, so it is restored explicitly
        kept_tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = kept_tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        """ Starts a new wers_normalny element and makes it the open verse. """
        fresh = self.stanza.makeelement("wers_normalny")
        self.open_verse = fresh
        self.verses.append(fresh)

    def get_open_verse(self):
        """ Returns the open verse, starting a normal one if there is none. """
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """ Appends text to the open verse, opening a new verse at each '/'. """
        if not text:
            return
        pieces = re.split(r"/\s*\n", text)
        for position, piece in enumerate(pieces):
            if position > 0:
                self.open_normal_verse()
            target = self.get_open_verse()
            if len(target) == 0:
                target.text = (target.text or "") + piece
            else:
                # text after a child element lives in that child's tail
                last_child = target[-1]
                last_child.tail = (last_child.tail or "") + piece

    def push_elem(self, elem):
        """ Adds a copy of a child: as a verse if it is one, else into a verse. """
        is_verse = elem.tag.startswith("wers")
        clone = deepcopy(elem)
        clone.tail = None
        if is_verse:
            self.verses.append(clone)
            self.open_verse = clone
        else:
            self.get_open_verse().append(clone)
174
175
def replace_by_verse(tree):
    """ Splits every stanza found in the tree into verse elements, in place """

    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
182
183
def add_to_manifest(manifest, partno):
    """ Appends an item for part number `partno` to the content.opf manifest """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    item = manifest.makeelement(OPFNS('item'), attrib=attributes)
    manifest.append(item)
194
195
def add_to_spine(spine, partno):
    """ Appends an itemref for part number `partno` to the content.opf spine """

    attributes = {'idref': 'part%d' % partno}
    itemref = spine.makeelement(OPFNS('itemref'), attrib=attributes)
    spine.append(itemref)
201
202
class TOC(object):
    """ A node of the table-of-contents tree.

    Every node has a display name, the href of the part file it points to,
    an optional sub-number (turned into a '#subN' fragment) and a list of
    child nodes. The root node's own name and href are not rendered by
    write_to_xml() or html_part(); only its descendants are.
    """

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        # set by add(..., is_part=False); None means "link to the whole part"
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Adds a new entry `level` levels below this node.

        For level > 0 the entry is added under the last child (recursively);
        if there are no children yet, it is added right here instead.
        `index` (allowed only with level 0) inserts the entry at that
        position instead of appending it.

        Returns the entry's sub-number when is_part is false, None otherwise.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)
        else:
            t = TOC(name)
            t.part_href = part_href
            if index is not None:
                self.children.insert(index, t)
            else:
                self.children.append(t)
            if not is_part:
                # counted after the entry was added, so numbering starts at 2
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        """ Adds an existing TOC node as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Adds all children of another TOC node as children of this one. """
        self.children.extend(toc.children)

    def depth(self):
        """ Returns the number of levels below this node (0 for a leaf). """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """ Returns the link target: part href plus '#subN' if numbered. """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ Writes the children as nested NCX navPoint elements under nav_map.

        `counter` is the id / playOrder of the first navPoint written.
        Returns the first unused counter value.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Returns the children rendered as indented HTML links.

        Entry names are plain text, so they are XML-escaped before being
        put into the markup; a missing name is rendered as an empty label.
        """
        from xml.sax.saxutils import escape

        texts = []
        for child in self.children:
            label = escape(child.name) if child.name is not None else ''
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), label))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):
        """ Returns the HTML table of contents built from epub/toc.html. """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
275
276
def used_chars(element):
    """ Returns the set of characters found in an element's subtree

    Covers the text and tail of the element itself and of all descendants.
    """
    own_text = (element.text or '') + (element.tail or '')
    found = set(own_text)
    for child in element:
        found |= used_chars(child)
    return found
283
284
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator. Each yielded value is an 'utwor' element holding a single
    'master' child with deep copies of consecutive children of `main_text`.

    NOTE: the very same container element is yielded every time and is
    refilled after each yield, so a chunk has to be fully processed (or
    copied) before the next one is requested. The first chunk is empty when
    the text starts with a chunk-opening header.

    A new chunk starts at every 'naglowek_czesc', and at every chapter-level
    header unless that header directly follows a 'naglowek_czesc'.
    """

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0] # master

    # Workaround for a problem with epubs in drama ebooks without acts:
    # when the text has scenes but no acts, scenes are the chunk boundaries;
    # otherwise chapters, acts and mid-titles are.
    top_level_tags = [one_part.tag for one_part in main_text]
    is_scene = 'naglowek_scena' in top_level_tags
    is_act = 'naglowek_akt' in top_level_tags
    if is_scene and not is_act:
        # A real tuple: membership in a bare string would be a substring
        # test and would raise TypeError for non-string tags (e.g. comments).
        split_tags = ("naglowek_scena",)
    else:
        split_tags = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    # True right after a 'naglowek_czesc', so that the header following it
    # stays in the same chunk
    last_node_part = False

    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in split_tags:
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
330
331
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: a chunk as produced by chop(); headers are looked up among
        the children of its first child
    chunk_no: number of the chunk, used to build 'part%d.html' links
    annotations: element collecting the annotations found in the chunk
    empty: if true, the chunk's content is skipped and the static
        epub/emptyChunk.html is returned instead (the TOC is still built)
    _empty_html_static: internal; the mutable default is deliberate and
        serves as a cache for the content of emptyChunk.html between calls
    """

    toc = TOC()
    for element in chunk_xml[0]:
        # Plain equality: `in ("naglowek_czesc")` would be a substring test
        # on a string and would raise TypeError for non-string tags.
        if element.tag == "naglowek_czesc":
            toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            # remember the number on the element itself
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
356
357
def transform(wldoc, verbose=False,
              style=None, html_toc=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    wldoc: the document to convert; it is deep-copied, the original is not
        modified
    verbose: if true, the font optimizer's output is not silenced
    style: path to a CSS file to use instead of the default epub/style.css
    html_toc: if true, a HTML table of contents (toc.html) is added
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy, with-full-fonts

    Returns an OutputFile made from the name of a temporary .epub file.
    """

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns a tuple: the file's TOC, the next free chunk number,
        the set of characters used and the remaining sample budget.
        """

        replace_characters(wldoc.edoc.getroot())

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                # sample budget used up: emit a placeholder instead
                chars = set()
                with open(get_resource('epub/emptyChunk.html')) as empty_file:
                    html_string = empty_file.read()
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        empty = True
                    else:
                        # paragraphs and stanzas count against the budget
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample


    document = deepcopy(wldoc)
    del wldoc

    if flags:
        # flags are passed on as attributes of the root element
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    # add editors info
    document.edoc.getroot().set('editors', u', '.join(sorted(
        editor.readable() for editor in document.editors())))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    # delete=False: the file has to outlive this function
    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # the mimetype entry is written first and stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    zip.writestr(mime, 'application/epub+zip')
    zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                       'media-type="application/oebps-package+xml" />' \
                       '</rootfiles></container>')
    zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
    zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    zip.write(style, os.path.join('OPS', 'style.css'))

    if cover:
        if cover is True:
            cover = DefaultEbookCover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        zip.writestr('OPS/cover.html', etree.tostring(
                        cover_tree, method="html", pretty_print=True))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))


    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                               '</navMap></ncx>')
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        zip.writestr('OPS/annotations.html', etree.tostring(
                            html_tree, method="html", pretty_print=True))

    toc.add("Wesprzyj Wolne Lektury", "support.html")
    manifest.append(etree.fromstring(
        '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="support" />'))
    with open(get_resource('epub/support.html')) as support_file:
        html_string = support_file.read()
    chars.update(used_chars(etree.fromstring(html_string)))
    zip.writestr('OPS/support.html', html_string)

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    zip.writestr('OPS/last.html', etree.tostring(
                        html_tree, method="html", pretty_print=True))

    if not flags or 'without-fonts' not in flags:
        # strip fonts
        full_fonts = bool(flags) and 'with-full-fonts' in flags
        tmpdir = mkdtemp('-librarian-epub')
        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None

        try:
            # the optimizer script is run from its own directory
            os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
            for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
                          'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
                if not full_fonts:
                    # keep only the glyphs for characters used in the book
                    optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
                                  get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
                    if verbose:
                        print("Running font-optimizer")
                        subprocess.check_call(optimizer_call)
                    else:
                        # Discard the output. PIPE must not be used with
                        # check_call: nobody reads the pipes, so the child
                        # could block once they fill up.
                        with open(os.devnull, 'w') as devnull:
                            subprocess.check_call(optimizer_call, stdout=devnull, stderr=devnull)
                    # the subset font goes into the archive in both modes
                    zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
                else:
                    zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
                manifest.append(etree.fromstring(
                    '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
        finally:
            # restore the working directory and clean up even if the
            # optimizer failed
            if cwd is not None:
                os.chdir(cwd)
            rmtree(tmpdir)
    zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    # the first two metas are dtb:uid and dtb:depth
    toc_file[0][0].set('content', str(document.book_info.url))
    toc_file[0][1].set('content', str(toc.depth()))
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
    zip.close()
    # ZipFile does not close a file object it was handed; do it here so the
    # archive is fully on disk before it is reopened by name
    output_file.close()

    return OutputFile.from_filename(output_file.name)