1 # -*- coding: utf-8 -*-
 
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 
   6 from __future__ import print_function, unicode_literals
 
  13 from copy import deepcopy
 
  14 from mimetypes import guess_type
 
  16 from ebooklib import epub
 
  17 from lxml import etree
 
  19 from tempfile import mkdtemp, NamedTemporaryFile
 
  20 from shutil import rmtree
 
  22 from librarian import RDFNS, WLNS, DCNS, OutputFile
 
  23 from librarian.cover import make_cover
 
  25 from librarian import functions, get_resource
 
  27 from librarian.hyphenator import Hyphenator
 
  29 functions.reg_person_name()
 
  32 def squeeze_whitespace(s):
 
  34     return re.sub(b'\\s+', b' ', s)
 
  37 def set_hyph_language(source_tree):
 
  38     bibl_lng = etree.XPath('//dc:language//text()',
 
  39                            namespaces={'dc': str(DCNS)})(source_tree)
 
  40     short_lng = functions.lang_code_3to2(bibl_lng[0])
 
  42         return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
 
  48 def hyphenate_and_fix_conjunctions(source_tree, hyph):
 
  49     texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
 
  51         parent = t.getparent()
 
  54             wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
 
  56                 newt += hyph.inserted(w, u'\u00AD')
 
  59         newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
 
  67     """ Find out a node's name
 
  69     >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
 
  73     tempnode = deepcopy(node)
 
  75     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
 
  76         for e in tempnode.findall('.//%s' % p):
 
  80     etree.strip_tags(tempnode, '*')
 
  84 def xslt(xml, sheet, **kwargs):
 
  85     if isinstance(xml, etree._Element):
 
  86         xml = etree.ElementTree(xml)
 
  87     with open(sheet) as xsltf:
 
  88         transform = etree.XSLT(etree.parse(xsltf))
 
  90             (key, transform.strparam(value))
 
  91             for key, value in kwargs.items()
 
  93         return transform(xml, **params)
 
  96 def replace_characters(node):
 
  97     def replace_chars(text):
 
 100         return text.replace(u"\ufeff", u"")\
 
 101                    .replace("---", u"\u2014")\
 
 102                    .replace("--", u"\u2013")\
 
 103                    .replace(",,", u"\u201E")\
 
 104                    .replace('"', u"\u201D")\
 
 105                    .replace("'", u"\u2019")
 
 106     if node.tag in ('uwaga', 'extra'):
 
 110     node.text = replace_chars(node.text)
 
 111     node.tail = replace_chars(node.tail)
 
 113         replace_characters(child)
 
 116 def find_annotations(annotations, source, part_no):
 
 118         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 
 119             annotation = deepcopy(child)
 
 120             number = str(len(annotations) + 1)
 
 121             annotation.set('number', number)
 
 122             annotation.set('part', str(part_no))
 
 124             annotations.append(annotation)
 
 129         if child.tag not in ('extra', 'uwaga'):
 
 130             find_annotations(annotations, child, part_no)
 
 133 class Stanza(object):
 
 135     Converts / verse endings into verse elements in a stanza.
 
 137     Slashes may only occur directly in the stanza. Any slashes in subelements
 
 138     will be ignored, and the subelements will be put inside verse elements.
 
 140     >>> s = etree.fromstring(
 
 141     ...         "<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>"
 
 143     >>> Stanza(s).versify()
 
 144     >>> print(etree.tostring(s, encoding='unicode', pretty_print=True).strip())
 
 146       <wers_normalny>a <b>c</b><b>c</b></wers_normalny>
 
 147       <wers_normalny>b<x>x/
 
 148     y</x>c</wers_normalny>
 
 149       <wers_normalny>d</wers_normalny>
 
 153     def __init__(self, stanza_elem):
 
 154         self.stanza = stanza_elem
 
 156         self.open_verse = None
 
 159         self.push_text(self.stanza.text)
 
 160         for elem in self.stanza:
 
 162             self.push_text(elem.tail)
 
 163         tail = self.stanza.tail
 
 165         self.stanza.tail = tail
 
 167             verse for verse in self.verses
 
 168             if verse.text or len(verse) > 0
 
 171     def open_normal_verse(self):
 
 172         self.open_verse = self.stanza.makeelement("wers_normalny")
 
 173         self.verses.append(self.open_verse)
 
 175     def get_open_verse(self):
 
 176         if self.open_verse is None:
 
 177             self.open_normal_verse()
 
 178         return self.open_verse
 
 180     def push_text(self, text):
 
 183         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
 
 185                 self.open_normal_verse()
 
 186             if not verse_text.strip():
 
 188             verse = self.get_open_verse()
 
 190                 verse[-1].tail = (verse[-1].tail or "") + verse_text
 
 192                 verse.text = (verse.text or "") + verse_text
 
 194     def push_elem(self, elem):
 
 195         if elem.tag.startswith("wers"):
 
 196             verse = deepcopy(elem)
 
 198             self.verses.append(verse)
 
 199             self.open_verse = verse
 
 201             appended = deepcopy(elem)
 
 203             self.get_open_verse().append(appended)
 
 206 def replace_by_verse(tree):
 
 207     """ Find stanzas and create new verses in place of a '/' character """
 
 209     stanzas = tree.findall('.//' + WLNS('strofa'))
 
 210     for stanza in stanzas:
 
 211         Stanza(stanza).versify()
 
 214 def used_chars(element):
 
 215     """ Lists characters used in an ETree Element """
 
 216     chars = set((element.text or '') + (element.tail or ''))
 
 217     for child in element:
 
 218         chars = chars.union(used_chars(child))
 
 223     """ divide main content of the XML file into chunks """
 
 225     # prepare a container for each chunk
 
 226     part_xml = etree.Element('utwor')
 
 227     etree.SubElement(part_xml, 'master')
 
 228     main_xml_part = part_xml[0]  # master
 
 230     last_node_part = False
 
 232     # The loops below are a workaround for a problem with epubs
 
 233     # in drama ebooks without acts.
 
 236     for one_part in main_text:
 
 238         if name == 'naglowek_scena':
 
 240         elif name == 'naglowek_akt':
 
 243     for one_part in main_text:
 
 245         if is_act is False and is_scene is True:
 
 246             if name == 'naglowek_czesc':
 
 248                 last_node_part = True
 
 249                 main_xml_part[:] = [deepcopy(one_part)]
 
 250             elif not last_node_part and name == "naglowek_scena":
 
 252                 main_xml_part[:] = [deepcopy(one_part)]
 
 254                 main_xml_part.append(deepcopy(one_part))
 
 255                 last_node_part = False
 
 257             if name == 'naglowek_czesc':
 
 259                 last_node_part = True
 
 260                 main_xml_part[:] = [deepcopy(one_part)]
 
 261             elif (not last_node_part
 
 263                       "naglowek_rozdzial", "naglowek_akt", "srodtytul"
 
 266                 main_xml_part[:] = [deepcopy(one_part)]
 
 268                 main_xml_part.append(deepcopy(one_part))
 
 269                 last_node_part = False
 
 273 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False,
 
 274                     _empty_html_static=[]):
 
 276     Transforms one chunk, returns a HTML string, a TOC object
 
 277     and a set of used characters.
 
 281     for element in chunk_xml[0]:
 
 282         if element.tag == "naglowek_czesc":
 
 286                         "part%d.xhtml#book-text" % chunk_no,
 
 288                         "part%d-text" % chunk_no
 
 293         elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 
 297                         "part%d.xhtml" % chunk_no,
 
 304         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 
 309                             "part%d.xhtml" % chunk_no,
 
 317             subnumber = len(toc[-1][1])
 
 320                     "part%d.xhtml#sub%d" % (chunk_no, subnumber),
 
 322                     "part%d-sub%d" % (chunk_no, subnumber)
 
 325             element.set('sub', six.text_type(subnumber))
 
 327         if not _empty_html_static:
 
 328             with open(get_resource('epub/emptyChunk.xhtml')) as f:
 
 329                 _empty_html_static.append(f.read())
 
 331         output_html = _empty_html_static[0]
 
 333         find_annotations(annotations, chunk_xml, chunk_no)
 
 334         replace_by_verse(chunk_xml)
 
 335         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 
 336         chars = used_chars(html_tree.getroot())
 
 337         output_html = etree.tostring(
 
 338             html_tree, pretty_print=True, xml_declaration=True,
 
 340             doctype='<!DOCTYPE html>'
 
 342     return output_html, toc, chars
 
 345 def remove_empty_lists_from_toc(toc):
 
 346     for i, e in enumerate(toc):
 
 347         if isinstance(e, tuple):
 
 349                 remove_empty_lists_from_toc(e[1])
 
 355 def transform_file(wldoc, chunk_counter=1, first=True, sample=None, hyphenate=False, output_type='epub', spine=None, output=None, annotations=None):
 
 356         """ processes one input file and proceeds to its children """
 
 358         replace_characters(wldoc.edoc.getroot())
 
 360         hyphenator = set_hyph_language(
 
 362         ) if hyphenate else None
 
 363         hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
 
 365         # every input file will have a TOC entry,
 
 366         # pointing to starting chunk
 
 370                     "part%d.xhtml" % chunk_counter,
 
 371                     wldoc.book_info.title,
 
 372                     "path%d-start" % chunk_counter
 
 379             # write book title page
 
 380             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'),
 
 381                              outputtype=output_type)
 
 382             chars = used_chars(html_tree.getroot())
 
 383             html_string = etree.tostring(
 
 384                 html_tree, pretty_print=True, xml_declaration=True,
 
 386                 doctype='<!DOCTYPE html>'
 
 388             item = epub.EpubItem(
 
 390                 file_name="title.xhtml",
 
 391                 media_type="application/xhtml+xml",
 
 392                 content=squeeze_whitespace(html_string)
 
 395             output.add_item(item)
 
 396             # add a title page TOC entry
 
 405             item = epub.EpubNav()
 
 413             output.add_item(item)
 
 424         elif wldoc.book_info.parts:
 
 425             # write title page for every parent
 
 426             if sample is not None and sample <= 0:
 
 429                     get_resource('epub/emptyChunk.xhtml')).read()
 
 431                 html_tree = xslt(wldoc.edoc,
 
 432                                  get_resource('epub/xsltChunkTitle.xsl'))
 
 433                 chars = used_chars(html_tree.getroot())
 
 434                 html_string = etree.tostring(
 
 435                     html_tree, pretty_print=True, xml_declaration=True,
 
 437                     doctype='<!DOCTYPE html>'
 
 439             item = epub.EpubItem(
 
 440                 uid="part%d" % chunk_counter,
 
 441                 file_name="part%d.xhtml" % chunk_counter,
 
 442                 media_type="application/xhtml+xml",
 
 443                 content=squeeze_whitespace(html_string)
 
 445             output.add_item(item)
 
 450         if len(wldoc.edoc.getroot()) > 1:
 
 451             # rdf before style master
 
 452             main_text = wldoc.edoc.getroot()[1]
 
 454             # rdf in style master
 
 455             main_text = wldoc.edoc.getroot()[0]
 
 456             if main_text.tag == RDFNS('RDF'):
 
 459         if main_text is not None:
 
 460             for chunk_xml in chop(main_text):
 
 462                 if sample is not None:
 
 466                         sample -= len(chunk_xml.xpath(
 
 467                             '//strofa|//akap|//akap_cd|//akap_dialog'
 
 469                 chunk_html, chunk_toc, chunk_chars = transform_chunk(
 
 470                     chunk_xml, chunk_counter, annotations, empty)
 
 472                 toc[-1][1].extend(chunk_toc)
 
 473                 chars = chars.union(chunk_chars)
 
 474                 item = epub.EpubItem(
 
 475                     uid="part%d" % chunk_counter,
 
 476                     file_name="part%d.xhtml" % chunk_counter,
 
 477                     media_type="application/xhtml+xml",
 
 478                     content=squeeze_whitespace(chunk_html)
 
 480                 output.add_item(item)
 
 484         for child in wldoc.parts():
 
 485             child_toc, chunk_counter, chunk_chars, sample = transform_file(
 
 486                 child, chunk_counter, first=False, sample=sample,
 
 487                 hyphenate=hyphenate, output_type=output_type,
 
 488                 spine=spine, output=output, annotations=annotations,
 
 490             toc[-1][1].extend(child_toc)
 
 491             chars = chars.union(chunk_chars)
 
 493         return toc, chunk_counter, chars, sample
 
 496 def transform(wldoc, verbose=False, style=None,
 
 497               sample=None, cover=None, flags=None, hyphenate=False,
 
 498               base_url='file://./', output_type='epub'):
 
 499     """ produces a EPUB file
 
 501     sample=n: generate sample e-book (with at least n paragraphs)
 
 502     cover: a cover.Cover factory or True for default
 
 503     flags: less-advertising, without-fonts, working-copy
 
 507     document = deepcopy(wldoc)
 
 512             document.edoc.getroot().set(flag, 'yes')
 
 514     document.clean_ed_note()
 
 515     document.clean_ed_note('abstrakt')
 
 518     editors = document.editors()
 
 520         document.edoc.getroot().set('editors', u', '.join(sorted(
 
 521             editor.readable() for editor in editors)))
 
 522     if document.book_info.funders:
 
 523         document.edoc.getroot().set('funders', u', '.join(
 
 524             document.book_info.funders))
 
 525     if document.book_info.thanks:
 
 526         document.edoc.getroot().set('thanks', document.book_info.thanks)
 
 528     output = epub.EpubBook()
 
 529     output.set_identifier(six.text_type(document.book_info.url))
 
 530     output.set_language(functions.lang_code_3to2(document.book_info.language))
 
 531     output.set_title(document.book_info.title)
 
 532     for i, author in enumerate(document.book_info.authors):
 
 535             file_as=six.text_type(author),
 
 536             uid='creator{}'.format(i)
 
 538     for translator in document.book_info.translators:
 
 540             translator.readable(),
 
 541             file_as=six.text_type(translator),
 
 543             uid='translator{}'.format(i)
 
 545     for publisher in document.book_info.publisher:
 
 546         output.add_metadata("DC", "publisher", publisher)
 
 547     output.add_metadata("DC", "date", document.book_info.created_at)
 
 549     output.guide.append({
 
 552         "href": "part1.xhtml"
 
 555     output.add_item(epub.EpubNcx())
 
 559     functions.reg_mathml_epub(output)
 
 562     for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
 
 563         url = six.moves.urllib.parse.urljoin(
 
 567         imgfile = six.moves.urllib.request.urlopen(url)
 
 568         img = Image.open(imgfile)
 
 570         th_format, ext, media_type = {
 
 571             'GIF': ('GIF', 'gif', 'image/gif'),
 
 572             'PNG': ('PNG', 'png', 'image/png'),
 
 573         }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
 
 576         if img.size[0] < width:
 
 579             th = img.resize((width, round(width * img.size[1] / img.size[0])))
 
 583         buffer = six.BytesIO()
 
 584         th.save(buffer, format=th_format)
 
 586         file_name = 'image%d.%s' % (i, ext)
 
 587         ilustr.set('src', file_name)
 
 592                 media_type=media_type,
 
 593                 content=buffer.getvalue()
 
 597     # write static elements
 
 599     with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
 
 602                 uid="logo_wolnelektury.png",
 
 603                 file_name="logo_wolnelektury.png",
 
 604                 media_type="image/png",
 
 608     with open(get_resource('res/jedenprocent.png'), 'rb') as f:
 
 612                 file_name="jedenprocent.png",
 
 613                 media_type="image/png",
 
 619         style = get_resource('epub/style.css')
 
 620     with open(style, 'rb') as f:
 
 624                 file_name="style.css",
 
 625                 media_type="text/css",
 
 634         cover_file = six.BytesIO()
 
 635         bound_cover = cover(document.book_info)
 
 636         bound_cover.save(cover_file)
 
 637         cover_name = 'cover.%s' % bound_cover.ext()
 
 640             file_name=cover_name,
 
 641             content=cover_file.getvalue(),
 
 643         spine.append('cover')
 
 644         output.guide.append({
 
 646             "href": "cover.xhtml",
 
 652         if bound_cover.uses_dc_cover:
 
 653             if document.book_info.cover_by:
 
 654                 document.edoc.getroot().set('data-cover-by',
 
 655                                             document.book_info.cover_by)
 
 656             if document.book_info.cover_source:
 
 657                 document.edoc.getroot().set('data-cover-source',
 
 658                                             document.book_info.cover_source)
 
 660     annotations = etree.Element('annotations')
 
 662     toc, chunk_counter, chars, sample = transform_file(
 
 663         document, sample=sample,
 
 664         hyphenate=hyphenate, output_type=output_type,
 
 665         spine=spine, output=output, annotations=annotations
 
 667     output.toc = toc[0][1]
 
 669     # Last modifications in container files and EPUB creation
 
 670     if len(annotations) > 0:
 
 678         replace_by_verse(annotations)
 
 679         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 
 680         chars = chars.union(used_chars(html_tree.getroot()))
 
 682         item = epub.EpubItem(
 
 684             file_name="annotations.xhtml",
 
 685             media_type="application/xhtml+xml",
 
 686             content=etree.tostring(
 
 687                 html_tree, pretty_print=True, xml_declaration=True,
 
 689                 doctype='<!DOCTYPE html>'
 
 692         output.add_item(item)
 
 698             "Wesprzyj Wolne Lektury",
 
 702     with open(get_resource('epub/support.xhtml'), 'rb') as f:
 
 703         html_string = f.read()
 
 704     chars.update(used_chars(etree.fromstring(html_string)))
 
 705     item = epub.EpubItem(
 
 707         file_name="support.xhtml",
 
 708         media_type="application/xhtml+xml",
 
 709         content=squeeze_whitespace(html_string)
 
 711     output.add_item(item)
 
 721     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'),
 
 722                      outputtype=output_type)
 
 723     chars.update(used_chars(html_tree.getroot()))
 
 724     item = epub.EpubItem(
 
 726         file_name="last.xhtml",
 
 727         media_type="application/xhtml+xml",
 
 728         content=squeeze_whitespace(etree.tostring(
 
 729             html_tree, pretty_print=True, xml_declaration=True,
 
 731             doctype='<!DOCTYPE html>'
 
 734     output.add_item(item)
 
 737     if not flags or 'without-fonts' not in flags:
 
 739         tmpdir = mkdtemp('-librarian-epub')
 
 745         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)),
 
 747         for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
 
 748                       'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
 
 749             optimizer_call = ['perl', 'subset.pl', '--chars',
 
 750                               ''.join(chars).encode('utf-8'),
 
 751                               get_resource('fonts/' + fname),
 
 752                               os.path.join(tmpdir, fname)]
 
 753             env = {"PERL_USE_UNSAFE_INC": "1"}
 
 755                 print("Running font-optimizer")
 
 756                 subprocess.check_call(optimizer_call, env=env)
 
 758                 dev_null = open(os.devnull, 'w')
 
 759                 subprocess.check_call(optimizer_call, stdout=dev_null,
 
 760                                       stderr=dev_null, env=env)
 
 761             with open(os.path.join(tmpdir, fname), 'rb') as f:
 
 766                         media_type="font/ttf",
 
 774     remove_empty_lists_from_toc(output.toc)
 
 776     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub',
 
 779     epub.write_epub(output_file.name, output, {'epub3_landmark': False})
 
 780     return OutputFile.from_filename(output_file.name)