New EPUB builder, other minor changes.
[librarian.git] / src / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import print_function, unicode_literals
7
8 import os
9 import os.path
10 import re
11 import subprocess
12 import six
13 from copy import deepcopy
14 from mimetypes import guess_type
15
16 from ebooklib import epub
17 from lxml import etree
18 from PIL import Image
19 from tempfile import mkdtemp, NamedTemporaryFile
20 from shutil import rmtree
21
22 from librarian import RDFNS, WLNS, DCNS, OutputFile
23 from librarian.cover import make_cover
24
25 from librarian import functions, get_resource
26
27 from librarian.hyphenator import Hyphenator
28
# Module-level side effect: runs once when this module is imported.
# NOTE(review): presumably registers a person-name helper needed by the
# XSLT stylesheets used below -- confirm in librarian.functions.
functions.reg_person_name()
30
31
def squeeze_whitespace(s):
    """Return ``s`` unchanged.

    Whitespace squeezing is switched off: the regex substitution that
    used to follow the ``return`` could never run, so that dead
    statement has been dropped.  The function stays as a pass-through
    so that existing call sites keep working.
    """
    return s
35
36
def set_hyph_language(source_tree):
    """Build a Hyphenator for the language declared in the document.

    The first ``dc:language`` text found in ``source_tree`` is converted
    with ``functions.lang_code_3to2`` and used to pick the
    ``res/hyph-dictionaries/hyph_<code>.dic`` resource.

    Returns the Hyphenator, or None when it could not be created.
    Raises IndexError when the tree contains no ``dc:language`` text.
    """
    bibl_lng = etree.XPath('//dc:language//text()',
                           namespaces={'dc': str(DCNS)})(source_tree)
    short_lng = functions.lang_code_3to2(bibl_lng[0])
    try:
        return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
                                       short_lng + '.dic'))
    except Exception:
        # Best effort: without a usable dictionary the caller simply gets
        # no hyphenator.  Unlike a bare ``except``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
46
47
def hyphenate_and_fix_conjunctions(source_tree, hyph):
    """Hyphenate text nodes and glue one-letter words to what follows.

    Works on every text node matched by ``/utwor/*[2]//text()``.  When
    ``hyph`` is not None each token (a run of word characters or a
    single non-word character) goes through ``hyph.inserted`` with a
    soft hyphen (U+00AD).  In all cases, whitespace that follows a
    single word character itself preceded by whitespace is replaced
    with a no-break space (U+00A0).  The result is written back to the
    owning element's ``text`` or ``tail``.
    """
    tokenize = re.compile(r'\w+|[^\w]', re.UNICODE).findall
    for text_node in etree.XPath('/utwor/*[2]//text()')(source_tree):
        owner = text_node.getparent()
        if hyph is None:
            processed = text_node
        else:
            processed = ''.join(
                hyph.inserted(token, u'\u00AD')
                for token in tokenize(text_node)
            )
        processed = re.sub(r'(?<=\s\w)\s+', u'\u00A0', processed)
        if text_node.is_text:
            owner.text = processed
        elif text_node.is_tail:
            owner.tail = processed
64
65
def node_name(node):
    """ Return the plain text of a node, as used for its name.

    Works on a copy, so ``node`` itself is left untouched.  The content
    of any ``pe``, ``pa``, ``pt``, ``pr`` and ``motyw`` descendants is
    dropped (their tails are kept); all remaining markup is stripped.

    >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
    XYZ
    """

    def blank(elem):
        # Empty the element but keep the text that follows it.
        following = elem.tail
        elem.clear()
        elem.tail = following

    scratch = deepcopy(node)
    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for found in scratch.findall('.//%s' % tag):
            blank(found)
    etree.strip_tags(scratch, '*')
    return scratch.text
82
83
def xslt(xml, sheet, **kwargs):
    """Apply the XSLT stylesheet at path ``sheet`` to ``xml``.

    A bare element is wrapped in an ElementTree first.  Keyword
    arguments are passed to the stylesheet as string parameters
    (each value goes through ``strparam``).  Returns the transform
    result.
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as sheet_file:
        transform = etree.XSLT(etree.parse(sheet_file))
        quoted = {
            name: transform.strparam(raw)
            for name, raw in kwargs.items()
        }
        return transform(xml, **quoted)
94
95
def replace_characters(node):
    """Apply typographic replacements to ``node`` and its descendants.

    ``uwaga`` and ``extra`` elements are emptied first (their tail is
    kept).  Then, in both ``text`` and ``tail``: byte order marks are
    removed, ``---`` and ``--`` become em and en dashes, ``,,`` becomes
    a low double quote, ``"`` a right double quote and ``'`` a right
    single quote.  The tree is modified in place.
    """
    # Order matters: "---" has to be handled before "--".
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    if node.tag in ('uwaga', 'extra'):
        saved_tail = node.tail
        node.clear()
        node.tail = saved_tail
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for descendant in node:
        replace_characters(descendant)
114
115
def find_annotations(annotations, source, part_no):
    """Collect annotation elements from ``source`` into ``annotations``.

    Every ``pe``, ``pa``, ``pt`` or ``pr`` child is copied into
    ``annotations`` with ``number`` (1-based position in the collection)
    and ``part`` (``part_no``) attributes and an empty tail.  The
    original element is emptied and its text set to the number; its
    tail is kept.  Children are searched recursively, except inside
    ``extra`` and ``uwaga`` elements.
    """
    footnote_tags = ('pe', 'pa', 'pt', 'pr')
    for node in source:
        if node.tag in footnote_tags:
            note = deepcopy(node)
            number = str(len(annotations) + 1)
            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)
            # Leave only the reference number in the running text.
            following_text = node.tail
            node.clear()
            node.tail = following_text
            node.text = number
        if node.tag in ('extra', 'uwaga'):
            continue
        find_annotations(annotations, node, part_no)
131
132
class Stanza(object):
    """
    Splits a stanza into verse elements at its "/" line endings.

    Only slashes found directly in the stanza (its own text and the
    tails of its children) end a verse.  Slashes inside subelements are
    left alone, and the subelements are placed inside verse elements.

    >>> s = etree.fromstring(
    ...         "<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>"
    ...     )
    >>> Stanza(s).versify()
    >>> print(etree.tostring(s, encoding='unicode', pretty_print=True).strip())
    <strofa>
      <wers_normalny>a <b>c</b><b>c</b></wers_normalny>
      <wers_normalny>b<x>x/
    y</x>c</wers_normalny>
      <wers_normalny>d</wers_normalny>
    </strofa>

    """
    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        # Verses collected so far, in document order.
        self.verses = []
        # The verse currently receiving content, if any.
        self.open_verse = None

    def versify(self):
        """Rebuild the stanza in place as a sequence of verses."""
        stanza = self.stanza
        self.push_text(stanza.text)
        for child in stanza:
            self.push_elem(child)
            self.push_text(child.tail)
        kept_tail = stanza.tail
        stanza.clear()
        stanza.tail = kept_tail
        # Verses with neither text nor children are dropped.
        non_empty = [
            verse for verse in self.verses
            if verse.text or len(verse) > 0
        ]
        stanza.extend(non_empty)

    def open_normal_verse(self):
        """Start a new ``wers_normalny`` and make it the open verse."""
        fresh = self.stanza.makeelement("wers_normalny")
        self.verses.append(fresh)
        self.open_verse = fresh

    def get_open_verse(self):
        """Return the open verse, starting a normal one if needed."""
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """Add text to the verses, starting a new verse at each "/"."""
        if not text:
            return
        pieces = re.split(r"/\s*\n", text)
        for index, piece in enumerate(pieces):
            if index > 0:
                self.open_normal_verse()
            if piece.strip():
                target = self.get_open_verse()
                if len(target) == 0:
                    target.text = (target.text or "") + piece
                else:
                    last_child = target[-1]
                    last_child.tail = (last_child.tail or "") + piece

    def push_elem(self, elem):
        """Add a copy of ``elem``: as a verse if its tag starts with
        "wers", otherwise as content of the open verse."""
        is_verse = elem.tag.startswith("wers")
        clone = deepcopy(elem)
        clone.tail = None
        if is_verse:
            self.verses.append(clone)
            self.open_verse = clone
        else:
            self.get_open_verse().append(clone)
204
205
def replace_by_verse(tree):
    """ Versify every stanza below ``tree``, in place.

    Each ``strofa`` descendant is split into verse elements at its
    '/' characters by ``Stanza.versify``.
    """
    for stanza in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza).versify()
212
213
def used_chars(element):
    """ Return the set of characters used in an ETree Element.

    Covers the text and tail of the element itself and of all its
    descendants.
    """
    found = set((element.text or '') + (element.tail or ''))
    for child in element:
        found |= used_chars(child)
    return found
220
221
def chop(main_text):
    """ Divide the main content of the XML file into chunks.

    Yields an ``utwor`` element holding a single ``master`` child that
    contains copies of the nodes of the current chunk.  The SAME element
    is yielded every time and refilled afterwards, so each chunk has to
    be consumed before the generator is advanced.

    A ``naglowek_czesc`` always starts a new chunk.  A lower-level
    header starts one too, unless it directly follows a
    ``naglowek_czesc``.
    """
    # One reusable container for all the chunks.
    part_xml = etree.Element('utwor')
    main_xml_part = etree.SubElement(part_xml, 'master')

    # Workaround for dramas that have scenes but no acts: there the
    # scene headers are the chunk boundaries.
    tags = [node.tag for node in main_text]
    has_scene = 'naglowek_scena' in tags
    has_act = 'naglowek_akt' in tags
    if has_scene and not has_act:
        minor_breaks = ("naglowek_scena",)
    else:
        minor_breaks = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    after_part_header = False
    for node in main_text:
        tag = node.tag
        if tag == 'naglowek_czesc':
            yield part_xml
            after_part_header = True
            main_xml_part[:] = [deepcopy(node)]
        elif not after_part_header and tag in minor_breaks:
            yield part_xml
            main_xml_part[:] = [deepcopy(node)]
        else:
            main_xml_part.append(deepcopy(node))
            after_part_header = False
    yield part_xml
271
272
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False,
                    _empty_html_static=[]):
    """
    Transform one chunk into HTML.

    Returns a tuple: the HTML string, a list of TOC entries and the set
    of characters used.  Each TOC entry is a ``(link, sublinks)`` pair.

    When ``empty`` is true the chunk is not transformed: the static
    ``epub/emptyChunk.xhtml`` resource is returned with an empty
    character set, but the TOC entries are still collected.

    ``_empty_html_static`` is not meant to be passed by callers; the
    shared default list caches the empty chunk between calls.
    """

    toc = []
    for element in chunk_xml[0]:
        tag = element.tag
        if tag == "naglowek_czesc":
            link = epub.Link(
                "part%d.xhtml#book-text" % chunk_no,
                node_name(element),
                "part%d-text" % chunk_no
            )
            toc.append((link, []))
        elif tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            link = epub.Link(
                "part%d.xhtml" % chunk_no,
                node_name(element),
                "part%d" % chunk_no
            )
            toc.append((link, []))
        elif tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            if not toc:
                # No parent entry yet: add one with a blank title.
                placeholder = epub.Link(
                    "part%d.xhtml" % chunk_no,
                    " ",
                    "part%d" % chunk_no
                )
                toc.append((placeholder, []))

            sublinks = toc[-1][1]
            subnumber = len(sublinks)
            sublinks.append(
                epub.Link(
                    "part%d.xhtml#sub%d" % (chunk_no, subnumber),
                    node_name(element),
                    "part%d-sub%d" % (chunk_no, subnumber)
                )
            )
            element.set('sub', six.text_type(subnumber))

    if not empty:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html>'
        )
        return output_html, toc, chars

    if not _empty_html_static:
        with open(get_resource('epub/emptyChunk.xhtml')) as f:
            _empty_html_static.append(f.read())
    return _empty_html_static[0], toc, set()
343
344
def remove_empty_lists_from_toc(toc):
    """Flatten childless TOC entries, in place.

    A tuple entry whose second item is empty is replaced by its first
    item alone; a tuple with a non-empty second item has that list
    processed recursively.  Entries that are not tuples are left as
    they are.
    """
    for position, entry in enumerate(toc):
        if not isinstance(entry, tuple):
            continue
        if entry[1]:
            remove_empty_lists_from_toc(entry[1])
        else:
            toc[position] = entry[0]
352
353
354
def transform_file(wldoc, chunk_counter=1, first=True, sample=None,
                   hyphenate=False, output_type='epub', spine=None,
                   output=None, annotations=None):
    """Process one input document and then, recursively, its children.

    Parameters:
        wldoc: the document to process.
        chunk_counter: number to use for the next ``part%d.xhtml`` file.
        first: True for the top-level document; a book title page and
            the navigation document are then added.
        sample: None for a full book, otherwise the number of
            paragraphs still to be included; once it drops to zero or
            below, further chunks are replaced with the empty chunk.
        hyphenate: whether to insert soft hyphens.
        output_type: passed to the title stylesheet as ``outputtype``.
        spine: list that every created item is appended to.
        output: the book object that items are added to.
        annotations: element collecting the annotations that are found.

    Returns a tuple ``(toc, chunk_counter, chars, sample)``: the TOC
    entry list for this document, the next free chunk number, the set of
    characters used and the remaining ``sample`` budget.
    """
    root = wldoc.edoc.getroot()
    replace_characters(root)

    hyphenator = set_hyph_language(root) if hyphenate else None
    hyphenate_and_fix_conjunctions(root, hyphenator)

    # every input file will have a TOC entry,
    # pointing to starting chunk
    toc = [
        (
            epub.Link(
                "part%d.xhtml" % chunk_counter,
                wldoc.book_info.title,
                "path%d-start" % chunk_counter
            ),
            []
        )
    ]
    chars = set()
    if first:
        # write book title page
        html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'),
                         outputtype=output_type)
        chars = used_chars(html_tree.getroot())
        html_string = etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html>'
        )
        item = epub.EpubItem(
            uid="titlePage",
            file_name="title.xhtml",
            media_type="application/xhtml+xml",
            content=squeeze_whitespace(html_string)
        )
        spine.append(item)
        output.add_item(item)
        # add a title page TOC entry
        toc[-1][1].append(
            epub.Link(
                "title.xhtml",
                "Strona tytułowa",
                "title",
            )
        )

        item = epub.EpubNav()
        toc[-1][1].append(
            epub.Link(
                "nav.xhtml",
                "Spis treści",
                "nav"
            )
        )
        output.add_item(item)
        spine.append(item)

        toc[-1][1].append(
            epub.Link(
                "part1.xhtml",
                "Początek utworu",
                "part1"
            )
        )

    elif wldoc.book_info.parts:
        # write title page for every parent
        if sample is not None and sample <= 0:
            chars = set()
            # ``with`` closes the resource; a bare open().read() left
            # the file handle to the garbage collector.
            with open(get_resource('epub/emptyChunk.xhtml')) as f:
                html_string = f.read()
        else:
            html_tree = xslt(wldoc.edoc,
                             get_resource('epub/xsltChunkTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            html_string = etree.tostring(
                html_tree, pretty_print=True, xml_declaration=True,
                encoding="utf-8",
                doctype='<!DOCTYPE html>'
            )
        item = epub.EpubItem(
            uid="part%d" % chunk_counter,
            file_name="part%d.xhtml" % chunk_counter,
            media_type="application/xhtml+xml",
            content=squeeze_whitespace(html_string)
        )
        output.add_item(item)
        spine.append(item)

        chunk_counter += 1

    if len(root) > 1:
        # rdf before style master
        main_text = root[1]
    else:
        # rdf in style master
        main_text = root[0]
        if main_text.tag == RDFNS('RDF'):
            main_text = None

    if main_text is not None:
        for chunk_xml in chop(main_text):
            empty = False
            if sample is not None:
                if sample <= 0:
                    empty = True
                else:
                    # Spend the sample budget on this chunk's paragraphs.
                    sample -= len(chunk_xml.xpath(
                        '//strofa|//akap|//akap_cd|//akap_dialog'
                    ))
            chunk_html, chunk_toc, chunk_chars = transform_chunk(
                chunk_xml, chunk_counter, annotations, empty)

            toc[-1][1].extend(chunk_toc)
            chars = chars.union(chunk_chars)
            item = epub.EpubItem(
                uid="part%d" % chunk_counter,
                file_name="part%d.xhtml" % chunk_counter,
                media_type="application/xhtml+xml",
                content=squeeze_whitespace(chunk_html)
            )
            output.add_item(item)
            spine.append(item)
            chunk_counter += 1

    for child in wldoc.parts():
        child_toc, chunk_counter, chunk_chars, sample = transform_file(
            child, chunk_counter, first=False, sample=sample,
            hyphenate=hyphenate, output_type=output_type,
            spine=spine, output=output, annotations=annotations,
        )
        toc[-1][1].extend(child_toc)
        chars = chars.union(chunk_chars)

    return toc, chunk_counter, chars, sample
494
495                 
def transform(wldoc, verbose=False, style=None,
              sample=None, cover=None, flags=None, hyphenate=False,
              base_url='file://./', output_type='epub'):
    """ produces a EPUB file

    wldoc: the source document; it is deep-copied, not modified
    verbose: print progress and do not silence the font optimizer
    style: path to a CSS file; defaults to the epub/style.css resource
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy
    hyphenate: insert soft hyphens into the text
    base_url: base that the ``src`` of every ``ilustr`` is resolved
        against
    output_type: passed to the stylesheets as ``outputtype``

    Returns an OutputFile for a temporary ``.epub`` file, which this
    function does not delete.  Raises subprocess.CalledProcessError if
    the font optimizer fails.
    """

    document = deepcopy(wldoc)
    del wldoc

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    document.clean_ed_note()
    document.clean_ed_note('abstrakt')

    # add editors info
    editors = document.editors()
    if editors:
        document.edoc.getroot().set('editors', u', '.join(sorted(
            editor.readable() for editor in editors)))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    output = epub.EpubBook()
    output.set_identifier(six.text_type(document.book_info.url))
    output.set_language(functions.lang_code_3to2(document.book_info.language))
    output.set_title(document.book_info.title)
    for i, author in enumerate(document.book_info.authors):
        output.add_author(
            author.readable(),
            file_as=six.text_type(author),
            uid='creator{}'.format(i)
        )
    # Translators get their own counter.  Reusing the index left over
    # from the authors loop gave every translator the same uid and
    # raised NameError for a book without authors.
    for i, translator in enumerate(document.book_info.translators):
        output.add_author(
            translator.readable(),
            file_as=six.text_type(translator),
            role='trl',
            uid='translator{}'.format(i)
        )
    for publisher in document.book_info.publisher:
        output.add_metadata("DC", "publisher", publisher)
    output.add_metadata("DC", "date", document.book_info.created_at)

    output.guide.append({
        "type": "text",
        "title": "Początek",
        "href": "part1.xhtml"
    })

    output.add_item(epub.EpubNcx())

    spine = output.spine

    functions.reg_mathml_epub(output)

    # Formats that are kept as they are; anything else is saved as JPEG.
    image_formats = {
        'GIF': ('GIF', 'gif', 'image/gif'),
        'PNG': ('PNG', 'png', 'image/png'),
    }
    # Images at least this wide are scaled down to this width.
    width = 1200

    # FIXME (marker carried over; what needs fixing is not recorded)
    for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
        url = six.moves.urllib.parse.urljoin(
            base_url,
            ilustr.get('src')
        )
        imgfile = six.moves.urllib.request.urlopen(url)
        try:
            img = Image.open(imgfile)

            th_format, ext, media_type = image_formats.get(
                img.format, ('JPEG', 'jpg', 'image/jpeg'))

            if img.size[0] < width:
                th = img
            else:
                th = img.resize(
                    (width, round(width * img.size[1] / img.size[0])))

            image_data = six.BytesIO()
            th.save(image_data, format=th_format)
        finally:
            # Image.open() is lazy, so the source has to stay open
            # until the image has been saved; closing it earlier broke
            # images that were not resized.
            imgfile.close()

        file_name = 'image%d.%s' % (i, ext)
        ilustr.set('src', file_name)
        output.add_item(
            epub.EpubItem(
                uid='image%s' % i,
                file_name=file_name,
                media_type=media_type,
                content=image_data.getvalue()
            )
        )

    # write static elements

    with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
        output.add_item(
            epub.EpubItem(
                uid="logo_wolnelektury.png",
                file_name="logo_wolnelektury.png",
                media_type="image/png",
                content=f.read()
            )
        )
    with open(get_resource('res/jedenprocent.png'), 'rb') as f:
        output.add_item(
            epub.EpubItem(
                uid="jedenprocent",
                file_name="jedenprocent.png",
                media_type="image/png",
                content=f.read()
            )
        )

    if not style:
        style = get_resource('epub/style.css')
    with open(style, 'rb') as f:
        output.add_item(
            epub.EpubItem(
                uid="style",
                file_name="style.css",
                media_type="text/css",
                content=f.read()
            )
        )

    if cover:
        if cover is True:
            cover = make_cover

        cover_file = six.BytesIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()

        output.set_cover(
            file_name=cover_name,
            content=cover_file.getvalue(),
        )
        spine.append('cover')
        output.guide.append({
            "type": "cover",
            "href": "cover.xhtml",
            "title": "Okładka",
        })

        del cover_file

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by',
                                            document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source',
                                            document.book_info.cover_source)

    annotations = etree.Element('annotations')

    toc, chunk_counter, chars, sample = transform_file(
        document, sample=sample,
        hyphenate=hyphenate, output_type=output_type,
        spine=spine, output=output, annotations=annotations
    )
    # The top-level entry only wraps the real table of contents.
    output.toc = toc[0][1]

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        output.toc.append(
            epub.Link(
                "annotations.xhtml",
                "Przypisy",
                "annotations"
            )
        )
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))

        item = epub.EpubItem(
            uid="annotations",
            file_name="annotations.xhtml",
            media_type="application/xhtml+xml",
            content=etree.tostring(
                html_tree, pretty_print=True, xml_declaration=True,
                encoding="utf-8",
                doctype='<!DOCTYPE html>'
            )
        )
        output.add_item(item)
        spine.append(item)

    output.toc.append(
        epub.Link(
            "support.xhtml",
            "Wesprzyj Wolne Lektury",
            "support"
        )
    )
    with open(get_resource('epub/support.xhtml'), 'rb') as f:
        html_string = f.read()
    chars.update(used_chars(etree.fromstring(html_string)))
    item = epub.EpubItem(
        uid="support",
        file_name="support.xhtml",
        media_type="application/xhtml+xml",
        content=squeeze_whitespace(html_string)
    )
    output.add_item(item)
    spine.append(item)

    output.toc.append(
        epub.Link(
            "last.xhtml",
            "Strona redakcyjna",
            "last"
        )
    )
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'),
                     outputtype=output_type)
    chars.update(used_chars(html_tree.getroot()))
    item = epub.EpubItem(
        uid="last",
        file_name="last.xhtml",
        media_type="application/xhtml+xml",
        content=squeeze_whitespace(etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html>'
        ))
    )
    output.add_item(item)
    spine.append(item)

    if not flags or 'without-fonts' not in flags:
        # strip fonts: embed only the glyphs for the characters used
        tmpdir = mkdtemp('-librarian-epub')
        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None

        try:
            # subset.pl is called by its relative name, so it is run
            # from the font-optimizer directory next to this module.
            os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  'font-optimizer'))
            for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
                          'DejaVuSerif-Italic.ttf',
                          'DejaVuSerif-BoldItalic.ttf'):
                optimizer_call = ['perl', 'subset.pl', '--chars',
                                  ''.join(chars).encode('utf-8'),
                                  get_resource('fonts/' + fname),
                                  os.path.join(tmpdir, fname)]
                env = {"PERL_USE_UNSAFE_INC": "1"}
                if verbose:
                    print("Running font-optimizer")
                    subprocess.check_call(optimizer_call, env=env)
                else:
                    with open(os.devnull, 'w') as dev_null:
                        subprocess.check_call(optimizer_call,
                                              stdout=dev_null,
                                              stderr=dev_null, env=env)
                with open(os.path.join(tmpdir, fname), 'rb') as f:
                    output.add_item(
                        epub.EpubItem(
                            uid=fname,
                            file_name=fname,
                            media_type="font/ttf",
                            content=f.read()
                        )
                    )
        finally:
            # Clean up and go back even when the optimizer fails.
            rmtree(tmpdir)
            if cwd is not None:
                os.chdir(cwd)

    remove_empty_lists_from_toc(output.toc)
    if verbose:
        # Diagnostic output only; it used to be printed unconditionally.
        print(output.toc)

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub',
                                     delete=False)
    output_file.close()
    epub.write_epub(output_file.name, output, {'epub3_landmark': False})
    return OutputFile.from_filename(output_file.name)
780     epub.write_epub(output_file.name, output, {'epub3_landmark': False})
781     return OutputFile.from_filename(output_file.name)