# src/librarian/epub.py (from librarian.git)
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import print_function, unicode_literals
7
8 import os
9 import os.path
10 import re
11 import subprocess
12 import six
13 from copy import deepcopy
14 from mimetypes import guess_type
15
16 from ebooklib import epub
17 from lxml import etree
18 from tempfile import mkdtemp, NamedTemporaryFile
19 from shutil import rmtree
20
21 from librarian import RDFNS, WLNS, DCNS, OutputFile
22 from librarian.cover import make_cover
23
24 from librarian import functions, get_resource
25
26 from librarian.hyphenator import Hyphenator
27
28 functions.reg_person_name()
29
30
def squeeze_whitespace(s):
    """Collapse every run of whitespace in *s* into a single space.

    Accepts either bytes (the usual case: output of etree.tostring with
    encoding="utf-8") or str (e.g. the static emptyChunk.xhtml read in
    text mode), and returns the same type it was given.  The original
    bytes-only pattern raised TypeError for str input.
    """
    if isinstance(s, bytes):
        return re.sub(b'\\s+', b' ', s)
    return re.sub(r'\s+', ' ', s)
33
34
def set_hyph_language(source_tree):
    """Return a Hyphenator for the document's dc:language, or None.

    Reads the dc:language metadata from the parsed document and tries
    to load the matching hyphenation dictionary.  Returns None when no
    language is declared or no dictionary is available, in which case
    hyphenation is simply skipped by the caller.
    """
    bibl_lng = etree.XPath('//dc:language//text()',
                           namespaces={'dc': str(DCNS)})(source_tree)
    if not bibl_lng:
        # No language metadata; previously this raised IndexError.
        return None
    short_lng = functions.lang_code_3to2(bibl_lng[0])
    try:
        return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
                                       short_lng + '.dic'))
    except Exception:
        # Missing/unreadable dictionary: hyphenation is optional.
        # (Was a bare `except:`, which also trapped SystemExit etc.)
        return None
44
45
def hyphenate_and_fix_conjunctions(source_tree, hyph):
    """Insert soft hyphens and non-breaking spaces into the body text.

    For every text node in the document body (second child of /utwor):
    if *hyph* is given, each word gets soft-hyphen (U+00AD) break marks;
    then any whitespace after a single-letter word is replaced with a
    non-breaking space (U+00A0) so one-letter conjunctions are never
    orphaned at the end of a line.
    """
    # Compile once instead of on every text node.
    word_re = re.compile(r'\w+|[^\w]', re.UNICODE)
    conjunction_re = re.compile(r'(?<=\s\w)\s+')
    texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
    for t in texts:
        parent = t.getparent()
        if hyph is not None:
            # Hyphenate word by word; non-word chunks pass through
            # unchanged.  join() avoids quadratic += concatenation.
            newt = ''.join(
                hyph.inserted(w, u'\u00AD') for w in word_re.findall(t)
            )
        else:
            newt = t
        newt = conjunction_re.sub(u'\u00A0', newt)
        if t.is_text:
            parent.text = newt
        elif t.is_tail:
            parent.tail = newt
62
63
def inner_xml(node):
    """ returns node's text and children as a string

    >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
    x<b>y</b>z
    """
    # Serialized children include their tails, so joining the leading
    # text with each child's serialization reproduces the inner markup.
    pieces = [node.text or '']
    pieces.extend(etree.tostring(child, encoding='unicode') for child in node)
    return ''.join(pieces)
75
76
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print(etree.tostring(e, encoding='unicode'))
    <a>x<b>y</b>z</a>
    """
    # Parse the fragment inside a throwaway wrapper element, then move
    # its text and children onto *node*, replacing whatever was there.
    wrapper = etree.fromstring('<x>%s</x>' % text)
    node.text = wrapper.text
    node[:] = wrapper[:]
89
90
def node_name(node):
    """ Find out a node's name

    >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
    XYZ
    """
    # Work on a copy: footnote/theme markers are destroyed in the
    # process, keeping only their tail text.
    scratch = deepcopy(node)

    for marker_tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for marker in scratch.findall('.//%s' % marker_tag):
            tail = marker.tail
            marker.clear()
            marker.tail = tail
    # Flatten all remaining markup into plain text.
    etree.strip_tags(scratch, '*')
    return scratch.text
107
108
def xslt(xml, sheet, **kwargs):
    """Apply the XSLT stylesheet at path *sheet* to *xml*.

    *xml* may be an element or an element tree; extra keyword arguments
    are passed to the stylesheet as string parameters.
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as sheet_file:
        transform = etree.XSLT(etree.parse(sheet_file))
        params = {
            name: transform.strparam(value)
            for name, value in kwargs.items()
        }
        return transform(xml, **params)
119
120
def replace_characters(node):
    """Recursively apply typographic replacements in a subtree.

    Strips BOMs and turns ASCII approximations into typography:
    '---' -> em dash, '--' -> en dash, ',,' -> low opening quote,
    '"' -> closing double quote, "'" -> right single quote.
    Editorial 'uwaga'/'extra' nodes are emptied, keeping only the tail.
    """
    # Order matters: '---' must be handled before '--'.
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def fix(text):
        if text is None:
            return None
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    if node.tag in ('uwaga', 'extra'):
        tail = node.tail
        node.clear()
        node.tail = tail
    node.text = fix(node.text)
    node.tail = fix(node.tail)
    for child in node:
        replace_characters(child)
139
140
def find_annotations(annotations, source, part_no):
    """Collect footnotes (pe/pa/pt/pr) from *source* into *annotations*.

    Each footnote element is copied into the *annotations* container
    with 'number' and 'part' attributes set; the original element is
    then emptied and its text replaced by that sequence number.
    Recurses into every child except editorial 'extra'/'uwaga' nodes.
    """
    footnote_tags = ('pe', 'pa', 'pt', 'pr')
    for child in source:
        if child.tag in footnote_tags:
            collected = deepcopy(child)
            number = str(len(annotations) + 1)
            collected.set('number', number)
            collected.set('part', str(part_no))
            collected.tail = ''
            annotations.append(collected)
            # Replace the in-text footnote by its reference number,
            # preserving the tail text that follows it.
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
156
157
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring(
    ...         "<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>"
    ...     )
    >>> Stanza(s).versify()
    >>> print(etree.tostring(s, encoding='unicode', pretty_print=True).strip())
    <strofa>
      <wers_normalny>a <b>c</b><b>c</b></wers_normalny>
      <wers_normalny>b<x>x/
    y</x>c</wers_normalny>
      <wers_normalny>d</wers_normalny>
    </strofa>

    """
    def __init__(self, stanza_elem):
        # The stanza element to rewrite in place.
        self.stanza = stanza_elem
        # Completed verse elements, in document order.
        self.verses = []
        # Verse currently receiving content; None until first content.
        self.open_verse = None

    def versify(self):
        """Rebuild the stanza in place so its children are verse elements."""
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        # Replace the stanza's old content with the collected verses,
        # preserving the stanza's own tail text (clear() drops it).
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        # Drop verses that ended up completely empty.
        self.stanza.extend(
            verse for verse in self.verses
            if verse.text or len(verse) > 0
        )

    def open_normal_verse(self):
        """Start a fresh, empty wers_normalny as the open verse."""
        self.open_verse = self.stanza.makeelement("wers_normalny")
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        """Return the verse being filled, creating one if none is open."""
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """Add stanza-level text, splitting verses on '/' + newline."""
        if not text:
            return
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            if i:
                # Every separator after the first fragment opens a new verse.
                self.open_normal_verse()
            if not verse_text.strip():
                continue
            verse = self.get_open_verse()
            if len(verse):
                # Text following a subelement belongs to its tail.
                verse[-1].tail = (verse[-1].tail or "") + verse_text
            else:
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        """Add a subelement: explicit verse elements ('wers*') are adopted
        as-is; anything else is copied into the currently open verse."""
        if elem.tag.startswith("wers"):
            verse = deepcopy(elem)
            verse.tail = None
            self.verses.append(verse)
            self.open_verse = verse
        else:
            appended = deepcopy(elem)
            appended.tail = None
            self.get_open_verse().append(appended)
230
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character """
    for stanza_elem in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza_elem).versify()
237
238
def used_chars(element):
    """ Lists characters used in an ETree Element """
    # Start from this node's own text and tail, then fold in children.
    chars = set(element.text or '')
    chars.update(element.tail or '')
    for child in element:
        chars |= used_chars(child)
    return chars
245
246
def chop(main_text):
    """ divide main content of the XML file into chunks """

    # A single container is reused for every chunk:
    # <utwor><master>...</master></utwor>.  Consumers must process each
    # yielded chunk before advancing the generator.
    part_xml = etree.Element('utwor')
    master = etree.SubElement(part_xml, 'master')

    last_node_part = False

    # Workaround for drama ebooks without acts: when scene headers exist
    # but no act headers, chunks split on scenes instead of the usual
    # chapter/act/half-title headers.
    has_scene = any(part.tag == 'naglowek_scena' for part in main_text)
    has_act = any(part.tag == 'naglowek_akt' for part in main_text)
    if has_scene and not has_act:
        secondary_breaks = ('naglowek_scena',)
    else:
        secondary_breaks = ('naglowek_rozdzial', 'naglowek_akt', 'srodtytul')

    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            # A part header always starts a new chunk.
            yield part_xml
            last_node_part = True
            master[:] = [deepcopy(one_part)]
        elif not last_node_part and name in secondary_breaks:
            # Secondary headers start a chunk unless one was just
            # started by a part header.
            yield part_xml
            master[:] = [deepcopy(one_part)]
        else:
            master.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
296
297
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False,
                    _empty_html_static=[]):
    """
    Transforms one chunk, returns a HTML string, a TOC object
    and a set of used characters.

    chunk_xml: an <utwor><master>...</master></utwor> container from chop().
    chunk_no: 1-based chunk index, used for file names and TOC anchors.
    annotations: container element that collects footnotes (mutated here).
    empty: if True, emit the static placeholder chunk (sample e-books).
    _empty_html_static: intentional mutable default, used as a one-time
        cache for the placeholder HTML; callers must not pass it.
    """

    # TOC entries: a list of (epub.Link, [sub-links]) pairs for headers
    # found at the top level of this chunk's master element.
    toc = []
    for element in chunk_xml[0]:
        if element.tag == "naglowek_czesc":
            toc.append(
                (
                    epub.Link(
                        "part%d.xhtml#book-text" % chunk_no,
                        node_name(element),
                        "part%d-text" % chunk_no
                    ),
                    []
                )
            )
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.append(
                (
                    epub.Link(
                        "part%d.xhtml" % chunk_no,
                        node_name(element),
                        "part%d" % chunk_no
                    ),
                    []
                )
            )
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            # Sub-headers attach under the previous header; synthesize a
            # blank parent entry if the chunk starts with a sub-header.
            if not toc:
                toc.append(
                    (
                        epub.Link(
                            "part%d.xhtml" % chunk_no,
                            " ",
                            "part%d" % chunk_no
                        ),
                        []
                    )
                )

            subnumber = len(toc[-1][1])
            toc[-1][1].append(
                epub.Link(
                    "part%d.xhtml#sub%d" % (chunk_no, subnumber),
                    node_name(element),
                    "part%d-sub%d" % (chunk_no, subnumber)
                )
            )
            # Mark the element so the XSLT can emit the matching anchor.
            element.set('sub', six.text_type(subnumber))
    if empty:
        # Sample mode: emit the cached static placeholder page.
        if not _empty_html_static:
            with open(get_resource('epub/emptyChunk.xhtml')) as f:
                _empty_html_static.append(f.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        # Full mode: extract footnotes, split verses, then render.
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html>'
        )
    return output_html, toc, chars
368
369
def remove_empty_lists_from_toc(toc):
    """Flatten (link, []) pairs in a TOC tree into bare links, in place."""
    for idx, entry in enumerate(toc):
        if not isinstance(entry, tuple):
            continue
        if entry[1]:
            # Non-empty children: clean them up recursively.
            remove_empty_lists_from_toc(entry[1])
        else:
            # Childless pair collapses to its link.
            toc[idx] = entry[0]
377
378
def transform(wldoc, verbose=False, style=None,
              sample=None, cover=None, flags=None, hyphenate=False,
              ilustr_path='', output_type='epub'):
    """ produces a EPUB file

    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy

    verbose: show font-optimizer output instead of silencing it.
    style: path to a CSS file; defaults to the bundled epub/style.css.
    hyphenate: insert soft hyphens using the language's dictionary.
    ilustr_path: directory with illustration files referenced by
        <ilustr src="..."> elements.
    output_type: passed through to the title/last-page stylesheets.
    Returns an OutputFile wrapping the generated EPUB.
    """

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children """

        # Closes over `output`, `spine`, `annotations` defined below;
        # returns (toc, next chunk_counter, used chars, remaining sample).
        replace_characters(wldoc.edoc.getroot())

        hyphenator = set_hyph_language(
            wldoc.edoc.getroot()
        ) if hyphenate else None
        hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = [
            (
                epub.Link(
                    "part%d.xhtml" % chunk_counter,
                    wldoc.book_info.title,
                    "path%d-start" % chunk_counter
                ),
                []
            )
        ]
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'),
                             outputtype=output_type)
            chars = used_chars(html_tree.getroot())
            html_string = etree.tostring(
                html_tree, pretty_print=True, xml_declaration=True,
                encoding="utf-8",
                doctype='<!DOCTYPE html>'
            )
            item = epub.EpubItem(
                uid="titlePage",
                file_name="title.xhtml",
                media_type="application/xhtml+xml",
                content=squeeze_whitespace(html_string)
            )
            spine.append(item)
            output.add_item(item)
            # add a title page TOC entry
            toc[-1][1].append(
                epub.Link(
                    "title.xhtml",
                    "Strona tytułowa",
                    "title",
                )
            )

            # Navigation document, listed in the TOC as well.
            item = epub.EpubNav()
            toc[-1][1].append(
                epub.Link(
                    "nav.xhtml",
                    "Spis treści",
                    "nav"
                )
            )
            output.add_item(item)
            spine.append(item)

        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                # Sample quota exhausted: placeholder page instead.
                chars = set()
                html_string = open(
                    get_resource('epub/emptyChunk.xhtml')).read()
            else:
                html_tree = xslt(wldoc.edoc,
                                 get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(
                    html_tree, pretty_print=True, xml_declaration=True,
                    encoding="utf-8",
                    doctype='<!DOCTYPE html>'
                )
            item = epub.EpubItem(
                uid="part%d" % chunk_counter,
                file_name="part%d.xhtml" % chunk_counter,
                media_type="application/xhtml+xml",
                content=squeeze_whitespace(html_string)
            )
            output.add_item(item)
            spine.append(item)

            chunk_counter += 1

        # Locate the main text element, skipping the RDF metadata block.
        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        empty = True
                    else:
                        # Count paragraphs/stanzas against the sample quota.
                        sample -= len(chunk_xml.xpath(
                            '//strofa|//akap|//akap_cd|//akap_dialog'
                        ))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(
                    chunk_xml, chunk_counter, annotations, empty)

                toc[-1][1].extend(chunk_toc)
                chars = chars.union(chunk_chars)
                item = epub.EpubItem(
                    uid="part%d" % chunk_counter,
                    file_name="part%d.xhtml" % chunk_counter,
                    media_type="application/xhtml+xml",
                    content=squeeze_whitespace(chunk_html)
                )
                output.add_item(item)
                spine.append(item)
                chunk_counter += 1

        # Recurse into child documents, nesting their TOCs under ours.
        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc[-1][1].extend(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    # Work on a private copy; the caller's document stays untouched.
    document = deepcopy(wldoc)
    del wldoc

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    document.clean_ed_note()
    document.clean_ed_note('abstrakt')

    # add editors info
    editors = document.editors()
    if editors:
        document.edoc.getroot().set('editors', u', '.join(sorted(
            editor.readable() for editor in editors)))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    # Set up the EPUB container and its Dublin Core metadata.
    output = epub.EpubBook()
    output.set_identifier(six.text_type(document.book_info.url))
    output.set_language(functions.lang_code_3to2(document.book_info.language))
    output.set_title(document.book_info.title)
    for author in document.book_info.authors:
        output.add_author(
            author.readable(),
            file_as=six.text_type(author)
        )
    for translator in document.book_info.translators:
        output.add_author(
            translator.readable(),
            file_as=six.text_type(translator),
            role='translator'
        )
    for publisher in document.book_info.publisher:
        output.add_metadata("DC", "publisher", publisher)
    output.add_metadata("DC", "date", document.book_info.created_at)

    output.guide.append({
        "type": "text",
        "title": "Początek",
        "href": "part1.xhtml"
    })

    output.add_item(epub.EpubNcx())

    # Alias: items appended to `spine` below end up in the book's spine.
    spine = output.spine

    functions.reg_mathml_epub(output)

    # Bundle illustrations actually referenced by <ilustr> elements.
    if os.path.isdir(ilustr_path):
        ilustr_elements = set(ilustr.get('src')
                              for ilustr in document.edoc.findall('//ilustr'))
        for i, filename in enumerate(os.listdir(ilustr_path)):
            if filename not in ilustr_elements:
                continue
            file_path = os.path.join(ilustr_path, filename)
            with open(file_path, 'rb') as f:
                output.add_item(
                    epub.EpubItem(
                        uid='image%s' % i,
                        file_name=filename,
                        media_type=guess_type(file_path)[0],
                        content=f.read()
                    )
                )

    # write static elements

    with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
        output.add_item(
            epub.EpubItem(
                uid="logo_wolnelektury.png",
                file_name="logo_wolnelektury.png",
                media_type="image/png",
                content=f.read()
            )
        )
    with open(get_resource('res/jedenprocent.png'), 'rb') as f:
        output.add_item(
            epub.EpubItem(
                uid="jedenprocent",
                file_name="jedenprocent.png",
                media_type="image/png",
                content=f.read()
            )
        )

    if not style:
        style = get_resource('epub/style.css')
    with open(style, 'rb') as f:
        output.add_item(
            epub.EpubItem(
                uid="style",
                file_name="style.css",
                media_type="text/css",
                content=f.read()
            )
        )

    if cover:
        if cover is True:
            cover = make_cover

        cover_file = six.BytesIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()

        output.set_cover(
            file_name=cover_name,
            content=cover_file.getvalue(),
        )
        spine.append('cover')
        output.guide.append({
            "type": "cover",
            "href": "cover.xhtml",
            "title": "Okładka",
        })

        del cover_file

        # Propagate cover credits into the document for the last page.
        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by',
                                            document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source',
                                            document.book_info.cover_source)

    # Footnote container; transform_file fills it via transform_chunk.
    annotations = etree.Element('annotations')

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
    output.toc = toc[0][1]

    if len(toc) < 2:
        output.toc.append(
            epub.Link(
                "part1.xhtml",
                "Początek utworu",
                "part1"
            )
        )

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        output.toc.append(
            epub.Link(
                "annotations.xhtml",
                "Przypisy",
                "annotations"
            )
        )
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))

        item = epub.EpubItem(
            uid="annotations",
            file_name="annotations.xhtml",
            media_type="application/xhtml+xml",
            content=etree.tostring(
                html_tree, pretty_print=True, xml_declaration=True,
                encoding="utf-8",
                doctype='<!DOCTYPE html>'
            )
        )
        output.add_item(item)
        spine.append(item)

    # Static "support us" page.
    output.toc.append(
        epub.Link(
            "support.xhtml",
            "Wesprzyj Wolne Lektury",
            "support"
        )
    )
    with open(get_resource('epub/support.xhtml'), 'rb') as f:
        html_string = f.read()
    chars.update(used_chars(etree.fromstring(html_string)))
    item = epub.EpubItem(
        uid="support",
        file_name="support.xhtml",
        media_type="application/xhtml+xml",
        content=squeeze_whitespace(html_string)
    )
    output.add_item(item)
    spine.append(item)

    # Editorial (colophon) page rendered from the document metadata.
    output.toc.append(
        epub.Link(
            "last.xhtml",
            "Strona redakcyjna",
            "last"
        )
    )
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'),
                     outputtype=output_type)
    chars.update(used_chars(html_tree.getroot()))
    item = epub.EpubItem(
        uid="last",
        file_name="last.xhtml",
        media_type="application/xhtml+xml",
        content=squeeze_whitespace(etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html>'
        ))
    )
    output.add_item(item)
    spine.append(item)

    if not flags or 'without-fonts' not in flags:
        # strip fonts
        # Runs the bundled Perl font-optimizer to subset each font to
        # the characters actually used.  NOTE(review): this chdir()s
        # into the package directory and back — not thread-safe.
        tmpdir = mkdtemp('-librarian-epub')
        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None

        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                              'font-optimizer'))
        for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
                      'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
            optimizer_call = ['perl', 'subset.pl', '--chars',
                              ''.join(chars).encode('utf-8'),
                              get_resource('fonts/' + fname),
                              os.path.join(tmpdir, fname)]
            env = {"PERL_USE_UNSAFE_INC": "1"}
            if verbose:
                print("Running font-optimizer")
                subprocess.check_call(optimizer_call, env=env)
            else:
                dev_null = open(os.devnull, 'w')
                subprocess.check_call(optimizer_call, stdout=dev_null,
                                      stderr=dev_null, env=env)
            with open(os.path.join(tmpdir, fname), 'rb') as f:
                output.add_item(
                    epub.EpubItem(
                        uid=fname,
                        file_name=fname,
                        media_type="font/ttf",
                        content=f.read()
                    )
                )
        rmtree(tmpdir)
        if cwd is not None:
            os.chdir(cwd)

    remove_empty_lists_from_toc(output.toc)

    # Write to a named temp file (delete=False: the caller owns it via
    # the returned OutputFile).
    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub',
                                     delete=False)
    output_file.close()
    epub.write_epub(output_file.name, output, {'epub3_landmark': False})
    return OutputFile.from_filename(output_file.name)