1 # -*- coding: utf-8 -*-
 
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 
   6 from __future__ import with_statement
 
  12 from StringIO import StringIO
 
  13 from copy import deepcopy
 
  14 from lxml import etree
 
  16 from tempfile import mkdtemp, NamedTemporaryFile
 
  17 from shutil import rmtree
 
  19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
 
  20 from librarian.cover import DefaultEbookCover
 
  22 from librarian import functions, get_resource
 
  24 functions.reg_person_name()
 
  25 functions.reg_lang_code_3to2()
 
  29     """ returns node's text and children as a string
 
  31     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
 
  35     nt = node.text if node.text is not None else ''
 
  36     return ''.join([nt] + [etree.tostring(child) for child in node])
 
  38 def set_inner_xml(node, text):
 
  39     """ sets node's text and children from a string
 
  41     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
 
  42     >>> set_inner_xml(e, 'x<b>y</b>z')
 
  43     >>> print etree.tostring(e)
 
  47     p = etree.fromstring('<x>%s</x>' % text)
 
  53     """ Find out a node's name
 
  55     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
 
  59     tempnode = deepcopy(node)
 
  61     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
 
  62         for e in tempnode.findall('.//%s' % p):
 
  66     etree.strip_tags(tempnode, '*')
 
  71     if isinstance(xml, etree._Element):
 
  72         xml = etree.ElementTree(xml)
 
  73     with open(sheet) as xsltf:
 
  74         return xml.xslt(etree.parse(xsltf))
 
  77 def replace_characters(node):
 
  78     def replace_chars(text):
 
  81         #text = re.sub(r"(?<=\s\w)\s+", u"\u00a0", text) #fix for hanging single letter conjunctions – for future use.
 
  82         return text.replace(u"\ufeff", u"")\
 
  83                    .replace("---", u"\u2014")\
 
  84                    .replace("--", u"\u2013")\
 
  85                    .replace(",,", u"\u201E")\
 
  86                    .replace('"', u"\u201D")\
 
  87                    .replace("'", u"\u2019")
 
  88     if node.tag in ('uwaga', 'extra'):
 
  92     node.text = replace_chars(node.text)
 
  93     node.tail = replace_chars(node.tail)
 
  95         replace_characters(child)
 
  98 def find_annotations(annotations, source, part_no):
 
 100         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 
 101             annotation = deepcopy(child)
 
 102             number = str(len(annotations)+1)
 
 103             annotation.set('number', number)
 
 104             annotation.set('part', str(part_no))
 
 106             annotations.append(annotation)
 
 111         if child.tag not in ('extra', 'uwaga'):
 
 112             find_annotations(annotations, child, part_no)
 
 115 class Stanza(object):
 
 117     Converts / verse endings into verse elements in a stanza.
 
 119     Slashes may only occur directly in the stanza. Any slashes in subelements
 
 120     will be ignored, and the subelements will be put inside verse elements.
 
 122     >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
 
 123     >>> Stanza(s).versify()
 
 124     >>> print etree.tostring(s)
 
 125     <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
 
 126     y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
 
 129     def __init__(self, stanza_elem):
 
 130         self.stanza = stanza_elem
 
 132         self.open_verse = None
 
 135         self.push_text(self.stanza.text)
 
 136         for elem in self.stanza:
 
 138             self.push_text(elem.tail)
 
 139         tail = self.stanza.tail
 
 141         self.stanza.tail = tail
 
 142         self.stanza.extend(self.verses)
 
 144     def open_normal_verse(self):
 
 145         self.open_verse = self.stanza.makeelement("wers_normalny")
 
 146         self.verses.append(self.open_verse)
 
 148     def get_open_verse(self):
 
 149         if self.open_verse is None:
 
 150             self.open_normal_verse()
 
 151         return self.open_verse
 
 153     def push_text(self, text):
 
 156         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
 
 158                 self.open_normal_verse()
 
 159             verse = self.get_open_verse()
 
 161                 verse[-1].tail = (verse[-1].tail or "") + verse_text
 
 163                 verse.text = (verse.text or "") + verse_text
 
 165     def push_elem(self, elem):
 
 166         if elem.tag.startswith("wers"):
 
 167             verse = deepcopy(elem)
 
 169             self.verses.append(verse)
 
 170             self.open_verse = verse
 
 172             appended = deepcopy(elem)
 
 174             self.get_open_verse().append(appended)
 
 177 def replace_by_verse(tree):
 
 178     """ Find stanzas and create new verses in place of a '/' character """
 
 180     stanzas = tree.findall('.//' + WLNS('strofa'))
 
 181     for stanza in stanzas:
 
 182         Stanza(stanza).versify()
 
 185 def add_to_manifest(manifest, partno):
 
 186     """ Adds a node to the manifest section in content.opf file """
 
 188     partstr = 'part%d' % partno
 
 189     e = manifest.makeelement(OPFNS('item'), attrib={
 
 191                                  'href': partstr + '.html',
 
 192                                  'media-type': 'application/xhtml+xml',
 
 197 def add_to_spine(spine, partno):
 
 198     """ Adds a node to the spine section in content.opf file """
 
 200     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 
 205     def __init__(self, name=None, part_href=None):
 
 208         self.part_href = part_href
 
 209         self.sub_number = None
 
 211     def add(self, name, part_href, level=0, is_part=True, index=None):
 
 212         assert level == 0 or index is None
 
 213         if level > 0 and self.children:
 
 214             return self.children[-1].add(name, part_href, level-1, is_part)
 
 217             t.part_href = part_href
 
 218             if index is not None:
 
 219                 self.children.insert(index, t)
 
 221                 self.children.append(t)
 
 223                 t.sub_number = len(self.children) + 1
 
 226     def append(self, toc):
 
 227         self.children.append(toc)
 
 229     def extend(self, toc):
 
 230         self.children.extend(toc.children)
 
 234             return max((c.depth() for c in self.children)) + 1
 
 240         if self.sub_number is not None:
 
 241             src += '#sub%d' % self.sub_number
 
 244     def write_to_xml(self, nav_map, counter=1):
 
 245         for child in self.children:
 
 246             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 
 247             nav_point.set('id', 'NavPoint-%d' % counter)
 
 248             nav_point.set('playOrder', str(counter))
 
 250             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 
 251             text = nav_map.makeelement(NCXNS('text'))
 
 252             text.text = child.name
 
 253             nav_label.append(text)
 
 254             nav_point.append(nav_label)
 
 256             content = nav_map.makeelement(NCXNS('content'))
 
 257             content.set('src', child.href())
 
 258             nav_point.append(content)
 
 259             nav_map.append(nav_point)
 
 260             counter = child.write_to_xml(nav_point, counter + 1)
 
 263     def html_part(self, depth=0):
 
 265         for child in self.children:
 
 267                 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
 
 268                 (depth, child.href(), child.name))
 
 269             texts.append(child.html_part(depth+1))
 
 270         return "\n".join(texts)
 
 273         with open(get_resource('epub/toc.html')) as f:
 
 274             t = unicode(f.read(), 'utf-8')
 
 275         return t % self.html_part()
 
 278 def used_chars(element):
 
 279     """ Lists characters used in an ETree Element """
 
 280     chars = set((element.text or '') + (element.tail or ''))
 
 281     for child in element:
 
 282         chars = chars.union(used_chars(child))
 
 287     """ divide main content of the XML file into chunks """
 
 289     # prepare a container for each chunk
 
 290     part_xml = etree.Element('utwor')
 
 291     etree.SubElement(part_xml, 'master')
 
 292     main_xml_part = part_xml[0] # master
 
 294     last_node_part = False
 
    # the loops below are a workaround for a problem with epubs of drama ebooks without acts
 
 299     for one_part in main_text:
 
 301         if name == 'naglowek_scena':
 
 303         elif name == 'naglowek_akt':
 
 306     for one_part in main_text:
 
 308         if is_act is False and is_scene is True:
 
 309             if name == 'naglowek_czesc':
 
 311                 last_node_part = True
 
 312                 main_xml_part[:] = [deepcopy(one_part)]
 
 313             elif not last_node_part and name in ("naglowek_scena"):
 
 315                 main_xml_part[:] = [deepcopy(one_part)]
 
 317                 main_xml_part.append(deepcopy(one_part))
 
 318                 last_node_part = False
 
 320             if name == 'naglowek_czesc':
 
 322                 last_node_part = True
 
 323                 main_xml_part[:] = [deepcopy(one_part)]
 
 324             elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 
 326                 main_xml_part[:] = [deepcopy(one_part)]
 
 328                 main_xml_part.append(deepcopy(one_part))
 
 329                 last_node_part = False            
 
 333 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 
 334     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 
 337     for element in chunk_xml[0]:
 
 338         if element.tag in ("naglowek_czesc"):
 
 339             toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
 
 340         elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 
 341             toc.add(node_name(element), "part%d.html" % chunk_no)
 
 342         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 
 343             subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
 
 344             element.set('sub', str(subnumber))
 
 346         if not _empty_html_static:
 
 347             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 
 349         output_html = _empty_html_static[0]
 
 351         find_annotations(annotations, chunk_xml, chunk_no)
 
 352         replace_by_verse(chunk_xml)
 
 353         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 
 354         chars = used_chars(html_tree.getroot())
 
 355         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 
 356     return output_html, toc, chars
 
 359 def transform(wldoc, verbose=False,
 
 360               style=None, html_toc=False,
 
 361               sample=None, cover=None, flags=None):
 
 362     """ produces a EPUB file
 
 364     sample=n: generate sample e-book (with at least n paragraphs)
 
 365     cover: a cover.Cover factory or True for default
 
 366     flags: less-advertising, without-fonts, working-copy, with-full-fonts
 
 369     def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
 
 370         """ processes one input file and proceeds to its children """
 
 372         replace_characters(wldoc.edoc.getroot())
 
 374         # every input file will have a TOC entry,
 
 375         # pointing to starting chunk
 
 376         toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
 
 379             # write book title page
 
 380             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
 
 381             chars = used_chars(html_tree.getroot())
 
 382             zip.writestr('OPS/title.html',
 
 383                  etree.tostring(html_tree, method="html", pretty_print=True))
 
 384             # add a title page TOC entry
 
 385             toc.add(u"Strona tytułowa", "title.html")
 
 386         elif wldoc.book_info.parts:
 
 387             # write title page for every parent
 
 388             if sample is not None and sample <= 0:
 
 390                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 
 392                 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
 
 393                 chars = used_chars(html_tree.getroot())
 
 394                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 
 395             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 
 396             add_to_manifest(manifest, chunk_counter)
 
 397             add_to_spine(spine, chunk_counter)
 
 400         if len(wldoc.edoc.getroot()) > 1:
 
 401             # rdf before style master
 
 402             main_text = wldoc.edoc.getroot()[1]
 
 404             # rdf in style master
 
 405             main_text = wldoc.edoc.getroot()[0]
 
 406             if main_text.tag == RDFNS('RDF'):
 
 409         if main_text is not None:
 
 410             for chunk_xml in chop(main_text):
 
 412                 if sample is not None:
 
 416                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 
 417                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 
 419                 toc.extend(chunk_toc)
 
 420                 chars = chars.union(chunk_chars)
 
 421                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 
 422                 add_to_manifest(manifest, chunk_counter)
 
 423                 add_to_spine(spine, chunk_counter)
 
 426         for child in wldoc.parts():
 
 427             child_toc, chunk_counter, chunk_chars, sample = transform_file(
 
 428                 child, chunk_counter, first=False, sample=sample)
 
 429             toc.append(child_toc)
 
 430             chars = chars.union(chunk_chars)
 
 432         return toc, chunk_counter, chars, sample
 
 435     document = deepcopy(wldoc)
 
 440             document.edoc.getroot().set(flag, 'yes')
 
 443     document.edoc.getroot().set('editors', u', '.join(sorted(
 
 444         editor.readable() for editor in document.editors())))
 
 445     if document.book_info.funders:
 
 446         document.edoc.getroot().set('funders', u', '.join(
 
 447             document.book_info.funders))
 
 448     if document.book_info.thanks:
 
 449         document.edoc.getroot().set('thanks', document.book_info.thanks)
 
 451     opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
 
 452     manifest = opf.find('.//' + OPFNS('manifest'))
 
 453     guide = opf.find('.//' + OPFNS('guide'))
 
 454     spine = opf.find('.//' + OPFNS('spine'))
 
 456     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
 
 457     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 
 459     # write static elements
 
 460     mime = zipfile.ZipInfo()
 
 461     mime.filename = 'mimetype'
 
 462     mime.compress_type = zipfile.ZIP_STORED
 
 464     zip.writestr(mime, 'application/epub+zip')
 
 465     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 
 466                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 
 467                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 
 468                        'media-type="application/oebps-package+xml" />' \
 
 469                        '</rootfiles></container>')
 
 470     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 
 471     zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
 
 473         style = get_resource('epub/style.css')
 
 474     zip.write(style, os.path.join('OPS', 'style.css'))
 
 478             cover = DefaultEbookCover
 
 480         cover_file = StringIO()
 
 481         bound_cover = cover(document.book_info)
 
 482         bound_cover.save(cover_file)
 
 483         cover_name = 'cover.%s' % bound_cover.ext()
 
 484         zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
 
 487         cover_tree = etree.parse(get_resource('epub/cover.html'))
 
 488         cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
 
 489         zip.writestr('OPS/cover.html', etree.tostring(
 
 490                         cover_tree, method="html", pretty_print=True))
 
 492         if bound_cover.uses_dc_cover:
 
 493             if document.book_info.cover_by:
 
 494                 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
 
 495             if document.book_info.cover_source:
 
 496                 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
 
 498         manifest.append(etree.fromstring(
 
 499             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 
 500         manifest.append(etree.fromstring(
 
 501             '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
 
 502         spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
 
 503         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 
 504         guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
 
 507     annotations = etree.Element('annotations')
 
 509     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 
 510                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 
 511                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 
 512                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 
 514     nav_map = toc_file[-1]
 
 517         manifest.append(etree.fromstring(
 
 518             '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
 
 519         spine.append(etree.fromstring(
 
 520             '<itemref idref="html_toc" />'))
 
 521         guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
 
 523     toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
 
 525     if len(toc.children) < 2:
 
 526         toc.add(u"Początek utworu", "part1.html")
 
 528     # Last modifications in container files and EPUB creation
 
 529     if len(annotations) > 0:
 
 530         toc.add("Przypisy", "annotations.html")
 
 531         manifest.append(etree.fromstring(
 
 532             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 
 533         spine.append(etree.fromstring(
 
 534             '<itemref idref="annotations" />'))
 
 535         replace_by_verse(annotations)
 
 536         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 
 537         chars = chars.union(used_chars(html_tree.getroot()))
 
 538         zip.writestr('OPS/annotations.html', etree.tostring(
 
 539                             html_tree, method="html", pretty_print=True))
 
 541     toc.add("Wesprzyj Wolne Lektury", "support.html")
 
 542     manifest.append(etree.fromstring(
 
 543         '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
 
 544     spine.append(etree.fromstring(
 
 545         '<itemref idref="support" />'))
 
 546     html_string = open(get_resource('epub/support.html')).read()
 
 547     chars.update(used_chars(etree.fromstring(html_string)))
 
 548     zip.writestr('OPS/support.html', html_string)
 
 550     toc.add("Strona redakcyjna", "last.html")
 
 551     manifest.append(etree.fromstring(
 
 552         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 
 553     spine.append(etree.fromstring(
 
 554         '<itemref idref="last" />'))
 
 555     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
 
 556     chars.update(used_chars(html_tree.getroot()))
 
 557     zip.writestr('OPS/last.html', etree.tostring(
 
 558                         html_tree, method="html", pretty_print=True))
 
 560     if not flags or not 'without-fonts' in flags:
 
 562         tmpdir = mkdtemp('-librarian-epub')
 
 568         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 
 569         for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 
 570             if not flags or not 'with-full-fonts' in flags:
 
 571                 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 
 572                               get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]              
 
 574                     print "Running font-optimizer"
 
 575                     subprocess.check_call(optimizer_call)
 
 577                     subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
 578                     zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 
 580                 zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
 
 581             manifest.append(etree.fromstring(
 
 582                 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
 
 586     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
 
 587     title = document.book_info.title
 
 588     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 
 589     for st in attributes:
 
 590         meta = toc_file.makeelement(NCXNS('meta'))
 
 592         meta.set('content', '0')
 
 593         toc_file[0].append(meta)
 
 594     toc_file[0][0].set('content', str(document.book_info.url))
 
 595     toc_file[0][1].set('content', str(toc.depth()))
 
 596     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 
 600         toc.add(u"Spis treści", "toc.html", index=1)
 
 601         zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
 
 602     toc.write_to_xml(nav_map)
 
 603     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
 
 606     return OutputFile.from_filename(output_file.name)