1 # -*- coding: utf-8 -*-
 
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 
   6 from __future__ import print_function, unicode_literals
 
  13 from copy import deepcopy
 
  14 from mimetypes import guess_type
 
  16 from ebooklib import epub
 
  17 from lxml import etree
 
  18 from tempfile import mkdtemp, NamedTemporaryFile
 
  19 from shutil import rmtree
 
  21 from librarian import RDFNS, WLNS, DCNS, OutputFile
 
  22 from librarian.cover import make_cover
 
  24 from librarian import functions, get_resource
 
  26 from librarian.hyphenator import Hyphenator
 
  # Module-import side effect: registers the "person_name" extension function
  # with lxml so the XSLT stylesheets applied later in this file can call it.
  28 functions.reg_person_name()
 
  31 def squeeze_whitespace(s):
 
  32     return re.sub(b'\\s+', b' ', s)
 
  # Build a Hyphenator for the document's language: read dc:language from the
  # metadata, map the 3-letter code to a 2-letter one, and load the matching
  # dictionary from res/hyph-dictionaries.
  # NOTE(review): this listing is incomplete here — original lines 39 and
  # 41-45 are missing, so the enclosing control flow (presumably a try/except
  # around the dictionary load — TODO confirm) and the rest of the return
  # expression are not visible.
  35 def set_hyph_language(source_tree):

  36     bibl_lng = etree.XPath('//dc:language//text()',

  37                            namespaces={'dc': str(DCNS)})(source_tree)

  38     short_lng = functions.lang_code_3to2(bibl_lng[0])

  40         return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
 
  # Walk the text nodes of the document body (second child of /utwor),
  # insert soft hyphens (U+00AD) into words via the Hyphenator, and replace
  # the space after a single-letter word with a non-breaking space (U+00A0)
  # so one-letter conjunctions don't end a line.
  # NOTE(review): listing is missing several original lines (48, 50-51, 53,
  # 55-56, 58+), so the loop structure and how `newt` is written back are
  # not visible here.
  46 def hyphenate_and_fix_conjunctions(source_tree, hyph):

  47     texts = etree.XPath('/utwor/*[2]//text()')(source_tree)

  49         parent = t.getparent()

  52             wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)

  54                 newt += hyph.inserted(w, u'\u00AD')

  57         newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
 
  # Fragment of inner_xml(node): serializes a node's text plus all child
  # elements into one string (doctest below shows 'x<b>y</b>z' for the
  # example element).
  # NOTE(review): the `def` line and the joining/return lines are missing
  # from this listing (original lines 64, 66, 68-70, 72, 74+).
  65     """ returns node's text and children as a string

  67     >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))

  71     nt = node.text if node.text is not None else ''

  73         [nt] + [etree.tostring(child, encoding='unicode') for child in node]
 
  # Replace a node's text and children with content parsed from a string:
  # the string is wrapped in a throwaway <x> element and parsed.
  # NOTE(review): the lines that copy the parsed text/children back onto
  # `node` (original lines 87-89) are missing from this listing, as is the
  # doctest's expected output.
  77 def set_inner_xml(node, text):

  78     """ sets node's text and children from a string

  80     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')

  81     >>> set_inner_xml(e, 'x<b>y</b>z')

  82     >>> print(etree.tostring(e, encoding='unicode'))

  86     p = etree.fromstring('<x>%s</x>' % text)
 
  # Fragment of node_name(node): extracts a node's plain-text "name" — works
  # on a deep copy, drops footnote/motif subelements ('pe', 'pa', 'pt', 'pr',
  # 'motyw'), then strips all remaining tags so only text is left.
  # NOTE(review): the `def` line, the element-removal body of the inner loop,
  # and the return statement are missing from this listing.
  92     """ Find out a node's name

  94     >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))

  98     tempnode = deepcopy(node)

 100     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):

 101         for e in tempnode.findall('.//%s' % p):

 105     etree.strip_tags(tempnode, '*')
 
 # Apply the XSLT stylesheet at path `sheet` to `xml` (an Element or
 # ElementTree); keyword arguments are passed through as XSLT string
 # parameters via transform.strparam().  Returns the transform result tree.
 # NOTE(review): the line opening the `params = dict(` construction
 # (original line 114) and its closing line are missing from this listing —
 # only the generator body is visible.
 109 def xslt(xml, sheet, **kwargs):

 110     if isinstance(xml, etree._Element):

 111         xml = etree.ElementTree(xml)

 112     with open(sheet) as xsltf:

 113         transform = etree.XSLT(etree.parse(xsltf))

 115             (key, transform.strparam(value))

 116             for key, value in kwargs.items()

 118         return transform(xml, **params)
 
 # Recursively normalize typography in text and tail of every element:
 # strip BOM (U+FEFF), turn '---' into em dash, '--' into en dash, ',,' into
 # a low-9 quote, '"' into a right double quote and "'" into a right single
 # quote.  Elements tagged 'uwaga'/'extra' are exempted (editorial notes).
 # NOTE(review): listing is missing the guard for None text (original lines
 # 123-124), the body of the 'uwaga'/'extra' branch (132-134) and the
 # `for child in node:` line (137).
 121 def replace_characters(node):

 122     def replace_chars(text):

 125         return text.replace(u"\ufeff", u"")\

 126                    .replace("---", u"\u2014")\

 127                    .replace("--", u"\u2013")\

 128                    .replace(",,", u"\u201E")\

 129                    .replace('"', u"\u201D")\

 130                    .replace("'", u"\u2019")

 131     if node.tag in ('uwaga', 'extra'):

 135     node.text = replace_chars(node.text)

 136     node.tail = replace_chars(node.tail)

 138         replace_characters(child)
 
 # Collect footnote elements ('pe', 'pa', 'pt', 'pr') from `source` into the
 # `annotations` container: each copy gets a sequential 'number' attribute
 # and a 'part' attribute recording which chunk (part_no) it came from.
 # Recurses into children except 'extra'/'uwaga' editorial notes.
 # NOTE(review): the `for child in source:` line and the lines between the
 # append and the recursion (original lines 142, 148, 150-153, 156) are
 # missing from this listing — presumably the footnote is also replaced by
 # a reference marker in `source`; confirm against the full file.
 141 def find_annotations(annotations, source, part_no):

 143         if child.tag in ('pe', 'pa', 'pt', 'pr'):

 144             annotation = deepcopy(child)

 145             number = str(len(annotations) + 1)

 146             annotation.set('number', number)

 147             annotation.set('part', str(part_no))

 149             annotations.append(annotation)

 154         if child.tag not in ('extra', 'uwaga'):

 155             find_annotations(annotations, child, part_no)
 
 # Splits a <strofa> element's '/'-terminated lines into <wers_normalny>
 # verse elements (see the class docstring's doctest).  State: self.stanza,
 # self.verses (accumulated verse elements) and self.open_verse (the verse
 # currently receiving text/subelements).
 # NOTE(review): this listing is missing many original lines (159, 161, 164,
 # 167, 170, 175-177, 180, 182-183, 186, 189, 191, 194-195, 199, 204,
 # 206-207, 209, 212, 214, 216, 218, 222, 225, 227, 229-230) — including the
 # versify() entry point's def line and several statements inside the
 # visible methods.  Code below is reproduced verbatim from the listing.
 158 class Stanza(object):

 160     Converts / verse endings into verse elements in a stanza.

 162     Slashes may only occur directly in the stanza. Any slashes in subelements

 163     will be ignored, and the subelements will be put inside verse elements.

 165     >>> s = etree.fromstring(

 166     ...         "<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>"

 168     >>> Stanza(s).versify()

 169     >>> print(etree.tostring(s, encoding='unicode', pretty_print=True).strip())

 171       <wers_normalny>a <b>c</b><b>c</b></wers_normalny>

 172       <wers_normalny>b<x>x/

 173     y</x>c</wers_normalny>

 174       <wers_normalny>d</wers_normalny>

 178     def __init__(self, stanza_elem):

 179         self.stanza = stanza_elem

 181         self.open_verse = None

 184         self.push_text(self.stanza.text)

 185         for elem in self.stanza:

 187             self.push_text(elem.tail)

 188         tail = self.stanza.tail

 190         self.stanza.tail = tail

 192             verse for verse in self.verses

 193             if verse.text or len(verse) > 0

     # Start a fresh <wers_normalny> verse and make it the open verse.
 196     def open_normal_verse(self):

 197         self.open_verse = self.stanza.makeelement("wers_normalny")

 198         self.verses.append(self.open_verse)

     # Return the verse currently being filled, opening one if needed.
 200     def get_open_verse(self):

 201         if self.open_verse is None:

 202             self.open_normal_verse()

 203         return self.open_verse

     # Split text on '/'-newline boundaries; each piece after the first
     # opens a new verse.  Text lands in the open verse's text or in the
     # tail of its last child.
 205     def push_text(self, text):

 208         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):

 210                 self.open_normal_verse()

 211             if not verse_text.strip():

 213             verse = self.get_open_verse()

 215                 verse[-1].tail = (verse[-1].tail or "") + verse_text

 217                 verse.text = (verse.text or "") + verse_text

     # A subelement already tagged wers* becomes a verse of its own;
     # anything else is appended inside the open verse.
 219     def push_elem(self, elem):

 220         if elem.tag.startswith("wers"):

 221             verse = deepcopy(elem)

 223             self.verses.append(verse)

 224             self.open_verse = verse

 226             appended = deepcopy(elem)

 228             self.get_open_verse().append(appended)
 
 231 def replace_by_verse(tree):
 
 232     """ Find stanzas and create new verses in place of a '/' character """
 
 234     stanzas = tree.findall('.//' + WLNS('strofa'))
 
 235     for stanza in stanzas:
 
 236         Stanza(stanza).versify()
 
 # Recursively collect the set of characters appearing in an element's text,
 # tail and all descendants (used later to subset the embedded fonts).
 # NOTE(review): the final `return` statement (original line 244, presumably
 # `return chars`) is missing from this listing — confirm against the full
 # file.
 239 def used_chars(element):

 240     """ Lists characters used in an ETree Element """

 241     chars = set((element.text or '') + (element.tail or ''))

 242     for child in element:

 243         chars = chars.union(used_chars(child))
 
 # Fragment of chop(main_text): splits the document's main content into
 # per-chapter chunks.  Each chunk is a fresh <utwor><master>…</master>
 # wrapper; headers (naglowek_czesc / naglowek_rozdzial / naglowek_akt /
 # naglowek_scena / srodtytul) start a new chunk.  The first pass detects
 # drama ebooks that have scenes but no acts, so scene headers can act as
 # chunk boundaries there.
 # NOTE(review): the `def` line, the `yield` statements and several flag
 # assignments (is_act / is_scene, `empty`) are missing from this listing —
 # presumably this is a generator yielding part_xml copies; confirm against
 # the full file.
 248     """ divide main content of the XML file into chunks """

 250     # prepare a container for each chunk

 251     part_xml = etree.Element('utwor')

 252     etree.SubElement(part_xml, 'master')

 253     main_xml_part = part_xml[0]  # master

 255     last_node_part = False

 257     # The below loop are workaround for a problem with epubs

 258     # in drama ebooks without acts.

 261     for one_part in main_text:

 263         if name == 'naglowek_scena':

 265         elif name == 'naglowek_akt':

 268     for one_part in main_text:

 270         if is_act is False and is_scene is True:

 271             if name == 'naglowek_czesc':

 273                 last_node_part = True

 274                 main_xml_part[:] = [deepcopy(one_part)]

 275             elif not last_node_part and name == "naglowek_scena":

 277                 main_xml_part[:] = [deepcopy(one_part)]

 279                 main_xml_part.append(deepcopy(one_part))

 280                 last_node_part = False

 282             if name == 'naglowek_czesc':

 284                 last_node_part = True

 285                 main_xml_part[:] = [deepcopy(one_part)]

 286             elif (not last_node_part

 288                       "naglowek_rozdzial", "naglowek_akt", "srodtytul"

 291                 main_xml_part[:] = [deepcopy(one_part)]

 293                 main_xml_part.append(deepcopy(one_part))

 294                 last_node_part = False
 
 # Transform one chunk of the document into XHTML: build TOC entries for the
 # headers it contains, collect its footnotes into `annotations`, versify
 # stanzas, and run the epub XSLT scheme.  Returns (html_string, toc, chars).
 # The mutable default `_empty_html_static=[]` is a deliberate function-level
 # cache: emptyChunk.xhtml is read from disk only once and reused for every
 # empty chunk.
 # NOTE(review): this listing is missing many original lines (TOC-entry
 # constructors, the subnumber bookkeeping around line 342, the encoding
 # argument of etree.tostring, and the branch structure of the
 # empty/non-empty paths) — code below is reproduced verbatim.
 298 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False,

 299                     _empty_html_static=[]):

 301     Transforms one chunk, returns a HTML string, a TOC object

 302     and a set of used characters.

 306     for element in chunk_xml[0]:

 307         if element.tag == "naglowek_czesc":

 311                         "part%d.xhtml#book-text" % chunk_no,

 313                         "part%d-text" % chunk_no

 318         elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):

 322                         "part%d.xhtml" % chunk_no,

 329         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):

 334                             "part%d.xhtml" % chunk_no,

 342             subnumber = len(toc[-1][1])

 345                     "part%d.xhtml#sub%d" % (chunk_no, subnumber),

 347                     "part%d-sub%d" % (chunk_no, subnumber)

 350             element.set('sub', six.text_type(subnumber))

 352         if not _empty_html_static:

 353             with open(get_resource('epub/emptyChunk.xhtml')) as f:

 354                 _empty_html_static.append(f.read())

 356         output_html = _empty_html_static[0]

 358         find_annotations(annotations, chunk_xml, chunk_no)

 359         replace_by_verse(chunk_xml)

 360         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))

 361         chars = used_chars(html_tree.getroot())

 362         output_html = etree.tostring(

 363             html_tree, pretty_print=True, xml_declaration=True,

 365             doctype='<!DOCTYPE html>'

 367     return output_html, toc, chars
 
 # Recursively walk the nested TOC structure and prune empty child lists
 # from (entry, children) tuples.
 # NOTE(review): the condition guarding the recursion and the line that
 # replaces a tuple with a bare entry when its child list is empty (original
 # lines 373, 375-377) are missing from this listing.
 370 def remove_empty_lists_from_toc(toc):

 371     for i, e in enumerate(toc):

 372         if isinstance(e, tuple):

 374                 remove_empty_lists_from_toc(e[1])
 
 # Top-level EPUB producer: takes a WL document, returns an OutputFile with
 # the finished .epub.  Orchestrates character normalization, hyphenation,
 # chunking, TOC building, metadata, cover, annotations, static pages and
 # font subsetting via the ebooklib EpubBook API.
 # NOTE(review): this listing is heavily sampled — dozens of original lines
 # are missing throughout (docstring close, TOC tuple constructors, spine
 # handling, several `with` bodies, loop/branch lines).  All code below is
 # reproduced verbatim from the listing; comments mark the visible sections.
 379 def transform(wldoc, verbose=False, style=None,

 380               sample=None, cover=None, flags=None, hyphenate=False,

 381               ilustr_path='', output_type='epub'):

 382     """ produces a EPUB file

 384     sample=n: generate sample e-book (with at least n paragraphs)

 385     cover: a cover.Cover factory or True for default

 386     flags: less-advertising, without-fonts, working-copy

     # Recursive worker: converts one WL file and then its children,
     # threading chunk_counter / sample budget / used-chars through.
 389     def transform_file(wldoc, chunk_counter=1, first=True, sample=None):

 390         """ processes one input file and proceeds to its children """

 392         replace_characters(wldoc.edoc.getroot())

 394         hyphenator = set_hyph_language(

 396         ) if hyphenate else None

 397         hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)

 399         # every input file will have a TOC entry,

 400         # pointing to starting chunk

 404                     "part%d.xhtml" % chunk_counter,

 405                     wldoc.book_info.title,

 406                     "path%d-start" % chunk_counter

 413             # write book title page

 414             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'),

 415                              outputtype=output_type)

 416             chars = used_chars(html_tree.getroot())

 417             html_string = etree.tostring(

 418                 html_tree, pretty_print=True, xml_declaration=True,

 420                 doctype='<!DOCTYPE html>'

 422             item = epub.EpubItem(

 424                 file_name="title.xhtml",

 425                 media_type="application/xhtml+xml",

 426                 content=squeeze_whitespace(html_string)

 429             output.add_item(item)

 430             # add a title page TOC entry

 439             item = epub.EpubNav()

 447             output.add_item(item)

 450         elif wldoc.book_info.parts:

 451             # write title page for every parent

 452             if sample is not None and sample <= 0:

 455                     get_resource('epub/emptyChunk.xhtml')).read()

 457                 html_tree = xslt(wldoc.edoc,

 458                                  get_resource('epub/xsltChunkTitle.xsl'))

 459                 chars = used_chars(html_tree.getroot())

 460                 html_string = etree.tostring(

 461                     html_tree, pretty_print=True, xml_declaration=True,

 463                     doctype='<!DOCTYPE html>'

 465             item = epub.EpubItem(

 466                 uid="part%d" % chunk_counter,

 467                 file_name="part%d.xhtml" % chunk_counter,

 468                 media_type="application/xhtml+xml",

 469                 content=squeeze_whitespace(html_string)

 471             output.add_item(item)

         # Locate the main text master element (layout differs depending on
         # whether the RDF metadata precedes or sits inside the master).
 476         if len(wldoc.edoc.getroot()) > 1:

 477             # rdf before style master

 478             main_text = wldoc.edoc.getroot()[1]

 480             # rdf in style master

 481             main_text = wldoc.edoc.getroot()[0]

 482             if main_text.tag == RDFNS('RDF'):

 485         if main_text is not None:

 486             for chunk_xml in chop(main_text):

 488                 if sample is not None:

 492                         sample -= len(chunk_xml.xpath(

 493                             '//strofa|//akap|//akap_cd|//akap_dialog'

 495                 chunk_html, chunk_toc, chunk_chars = transform_chunk(

 496                     chunk_xml, chunk_counter, annotations, empty)

 498                 toc[-1][1].extend(chunk_toc)

 499                 chars = chars.union(chunk_chars)

 500                 item = epub.EpubItem(

 501                     uid="part%d" % chunk_counter,

 502                     file_name="part%d.xhtml" % chunk_counter,

 503                     media_type="application/xhtml+xml",

 504                     content=squeeze_whitespace(chunk_html)

 506                 output.add_item(item)

         # Recurse into child documents (book parts).
 510         for child in wldoc.parts():

 511             child_toc, chunk_counter, chunk_chars, sample = transform_file(

 512                 child, chunk_counter, first=False, sample=sample)

 513             toc[-1][1].extend(child_toc)

 514             chars = chars.union(chunk_chars)

 516         return toc, chunk_counter, chars, sample

     # Work on a deep copy; apply flags and strip editorial notes.
 518     document = deepcopy(wldoc)

 523             document.edoc.getroot().set(flag, 'yes')

 525     document.clean_ed_note()

 526     document.clean_ed_note('abstrakt')

 529     editors = document.editors()

 531         document.edoc.getroot().set('editors', u', '.join(sorted(

 532             editor.readable() for editor in editors)))

 533     if document.book_info.funders:

 534         document.edoc.getroot().set('funders', u', '.join(

 535             document.book_info.funders))

 536     if document.book_info.thanks:

 537         document.edoc.getroot().set('thanks', document.book_info.thanks)

     # Dublin Core metadata on the EpubBook.
 539     output = epub.EpubBook()

 540     output.set_identifier(six.text_type(document.book_info.url))

 541     output.set_language(functions.lang_code_3to2(document.book_info.language))

 542     output.set_title(document.book_info.title)

 543     for author in document.book_info.authors:

 546             file_as=six.text_type(author)

 548     for translator in document.book_info.translators:

 550             translator.readable(),

 551             file_as=six.text_type(translator),

 554     for publisher in document.book_info.publisher:

 555         output.add_metadata("DC", "publisher", publisher)

 556     output.add_metadata("DC", "date", document.book_info.created_at)

 558     output.guide.append({

 561         "href": "part1.xhtml"

 564     output.add_item(epub.EpubNcx())

 568     functions.reg_mathml_epub(output)

     # Bundle illustrations referenced by <ilustr> elements.
 570     if os.path.isdir(ilustr_path):

 571         ilustr_elements = set(ilustr.get('src')

 572                               for ilustr in document.edoc.findall('//ilustr'))

 573         for i, filename in enumerate(os.listdir(ilustr_path)):

 574             if filename not in ilustr_elements:

 576             file_path = os.path.join(ilustr_path, filename)

 577             with open(file_path, 'rb') as f:

 582                         media_type=guess_type(file_path)[0],

 587     # write static elements

 589     with open(get_resource('res/wl-logo-small.png'), 'rb') as f:

 592                 uid="logo_wolnelektury.png",

 593                 file_name="logo_wolnelektury.png",

 594                 media_type="image/png",

 598     with open(get_resource('res/jedenprocent.png'), 'rb') as f:

 602                 file_name="jedenprocent.png",

 603                 media_type="image/png",

 609         style = get_resource('epub/style.css')

 610     with open(style, 'rb') as f:

 614                 file_name="style.css",

 615                 media_type="text/css",

     # Generate and attach the cover (factory from the `cover` argument).
 624         cover_file = six.BytesIO()

 625         bound_cover = cover(document.book_info)

 626         bound_cover.save(cover_file)

 627         cover_name = 'cover.%s' % bound_cover.ext()

 630             file_name=cover_name,

 631             content=cover_file.getvalue(),

 633         spine.append('cover')

 634         output.guide.append({

 636             "href": "cover.xhtml",

 642         if bound_cover.uses_dc_cover:

 643             if document.book_info.cover_by:

 644                 document.edoc.getroot().set('data-cover-by',

 645                                             document.book_info.cover_by)

 646             if document.book_info.cover_source:

 647                 document.edoc.getroot().set('data-cover-source',

 648                                             document.book_info.cover_source)

     # Convert the document tree itself; collect footnotes along the way.
 650     annotations = etree.Element('annotations')

 652     toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

 653     output.toc = toc[0][1]

 664     # Last modifications in container files and EPUB creation

 665     if len(annotations) > 0:

 673         replace_by_verse(annotations)

 674         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))

 675         chars = chars.union(used_chars(html_tree.getroot()))

 677         item = epub.EpubItem(

 679             file_name="annotations.xhtml",

 680             media_type="application/xhtml+xml",

 681             content=etree.tostring(

 682                 html_tree, pretty_print=True, xml_declaration=True,

 684                 doctype='<!DOCTYPE html>'

 687             output.add_item(item)

 693             "Wesprzyj Wolne Lektury",

     # Static "support us" page.
 697     with open(get_resource('epub/support.xhtml'), 'rb') as f:

 698         html_string = f.read()

 699     chars.update(used_chars(etree.fromstring(html_string)))

 700     item = epub.EpubItem(

 702         file_name="support.xhtml",

 703         media_type="application/xhtml+xml",

 704         content=squeeze_whitespace(html_string)

 706     output.add_item(item)

     # Colophon / last page.
 716     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'),

 717                      outputtype=output_type)

 718     chars.update(used_chars(html_tree.getroot()))

 719     item = epub.EpubItem(

 721         file_name="last.xhtml",

 722         media_type="application/xhtml+xml",

 723         content=squeeze_whitespace(etree.tostring(

 724             html_tree, pretty_print=True, xml_declaration=True,

 726             doctype='<!DOCTYPE html>'

 729     output.add_item(item)

     # Subset the DejaVu fonts to the characters actually used, by shelling
     # out to font-optimizer's perl subset.pl in a temporary directory.
 732     if not flags or 'without-fonts' not in flags:

 734         tmpdir = mkdtemp('-librarian-epub')

 740         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)),

 742         for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',

 743                       'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):

 744             optimizer_call = ['perl', 'subset.pl', '--chars',

 745                               ''.join(chars).encode('utf-8'),

 746                               get_resource('fonts/' + fname),

 747                               os.path.join(tmpdir, fname)]

 748             env = {"PERL_USE_UNSAFE_INC": "1"}

 750                 print("Running font-optimizer")

 751                 subprocess.check_call(optimizer_call, env=env)

 753                 dev_null = open(os.devnull, 'w')

 754                 subprocess.check_call(optimizer_call, stdout=dev_null,

 755                                       stderr=dev_null, env=env)

 756             with open(os.path.join(tmpdir, fname), 'rb') as f:

 761                         media_type="font/ttf",

     # Final cleanup and EPUB write-out to a named temporary file.
 769     remove_empty_lists_from_toc(output.toc)

 771     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub',

 774     epub.write_epub(output_file.name, output, {'epub3_landmark': False})

 775     return OutputFile.from_filename(output_file.name)