1 # -*- coding: utf-8 -*-
 
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 
   6 from __future__ import with_statement
 
  12 from StringIO import StringIO
 
  13 from copy import deepcopy
 
  14 from lxml import etree
 
  16 from tempfile import mkdtemp, NamedTemporaryFile
 
  17 from shutil import rmtree
 
  18 from mimetypes import guess_type
 
  20 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
 
  21 from librarian.cover import WLCover, FutureOfCopyrightCover
 
  22 from librarian.latex import LatexFragment
 
  23 from librarian import functions, get_resource
 
  25 functions.reg_person_name()
 
  29     """ returns node's text and children as a string
 
  31     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
 
  35     nt = node.text if node.text is not None else ''
 
  36     return ''.join([nt] + [etree.tostring(child) for child in node])
 
  38 def set_inner_xml(node, text):
 
  39     """ sets node's text and children from a string
 
  41     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
 
  42     >>> set_inner_xml(e, 'x<b>y</b>z')
 
  43     >>> print etree.tostring(e)
 
  47     p = etree.fromstring('<x>%s</x>' % text)
 
  53     """ Find out a node's name
 
  55     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
 
  59     tempnode = deepcopy(node)
 
  61     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
 
  62         for e in tempnode.findall('.//%s' % p):
 
  66     etree.strip_tags(tempnode, '*')
 
  71     if isinstance(xml, etree._Element):
 
  72         xml = etree.ElementTree(xml)
 
  73     with open(sheet) as xsltf:
 
  74         return xml.xslt(etree.parse(xsltf))
 
  77 def replace_characters(node):
 
  78     def replace_chars(text):
 
  81         return text.replace(u"\ufeff", u"")\
 
  82                    .replace("---", u"\u2014")\
 
  83                    .replace("--", u"\u2013")\
 
  84                    .replace(",,", u"\u201E")\
 
  85                    .replace('"', u"\u201D")\
 
  86                    .replace("'", u"\u2019")
 
  87     if node.tag in ('uwaga', 'extra'):
 
  91     node.text = replace_chars(node.text)
 
  92     node.tail = replace_chars(node.tail)
 
  94         replace_characters(child)
 
  97 def find_annotations(annotations, source, part_no):
 
  99         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 
 100             annotation = deepcopy(child)
 
 101             number = str(len(annotations)+1)
 
 102             annotation.set('number', number)
 
 103             annotation.set('part', str(part_no))
 
 105             annotations.append(annotation)
 
 110         if child.tag not in ('extra', 'uwaga'):
 
 111             find_annotations(annotations, child, part_no)
 
 114 class Stanza(object):
 
 116     Converts / verse endings into verse elements in a stanza.
 
 118     Slashes may only occur directly in the stanza. Any slashes in subelements
 
 119     will be ignored, and the subelements will be put inside verse elements.
 
 121     >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
 
 122     >>> Stanza(s).versify()
 
 123     >>> print etree.tostring(s)
 
 124     <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
 
 125     y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
 
 128     def __init__(self, stanza_elem):
 
 129         self.stanza = stanza_elem
 
 131         self.open_verse = None
 
 134         self.push_text(self.stanza.text)
 
 135         for elem in self.stanza:
 
 137             self.push_text(elem.tail)
 
 138         tail = self.stanza.tail
 
 140         self.stanza.tail = tail
 
 141         self.stanza.extend(self.verses)
 
 143     def open_normal_verse(self):
 
 144         self.open_verse = self.stanza.makeelement("wers_normalny")
 
 145         self.verses.append(self.open_verse)
 
 147     def get_open_verse(self):
 
 148         if self.open_verse is None:
 
 149             self.open_normal_verse()
 
 150         return self.open_verse
 
 152     def push_text(self, text):
 
 155         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
 
 157                 self.open_normal_verse()
 
 158             verse = self.get_open_verse()
 
 160                 verse[-1].tail = (verse[-1].tail or "") + verse_text
 
 162                 verse.text = (verse.text or "") + verse_text
 
 164     def push_elem(self, elem):
 
 165         if elem.tag.startswith("wers"):
 
 166             verse = deepcopy(elem)
 
 168             self.verses.append(verse)
 
 169             self.open_verse = verse
 
 171             appended = deepcopy(elem)
 
 173             self.get_open_verse().append(appended)
 
 176 def replace_by_verse(tree):
 
 177     """ Find stanzas and create new verses in place of a '/' character """
 
 179     stanzas = tree.findall('.//' + WLNS('strofa'))
 
 180     for stanza in stanzas:
 
 181         Stanza(stanza).versify()
 
 184 def add_to_manifest(manifest, partno):
 
 185     """ Adds a node to the manifest section in content.opf file """
 
 187     partstr = 'part%d' % partno
 
 188     e = manifest.makeelement(OPFNS('item'), attrib={
 
 190                                  'href': partstr + '.html',
 
 191                                  'media-type': 'application/xhtml+xml',
 
 196 def add_to_spine(spine, partno):
 
 197     """ Adds a node to the spine section in content.opf file """
 
 199     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 
 204     def __init__(self, name=None, part_href=None):
 
 207         self.part_href = part_href
 
 208         self.sub_number = None
 
 210     def add(self, name, part_href, level=0, is_part=True, index=None):
 
 211         assert level == 0 or index is None
 
 212         if level > 0 and self.children:
 
 213             return self.children[-1].add(name, part_href, level-1, is_part)
 
 216             t.part_href = part_href
 
 217             if index is not None:
 
 218                 self.children.insert(index, t)
 
 220                 self.children.append(t)
 
 222                 t.sub_number = len(self.children) + 1
 
 225     def append(self, toc):
 
 226         self.children.append(toc)
 
 228     def extend(self, toc):
 
 229         self.children.extend(toc.children)
 
 233             return max((c.depth() for c in self.children)) + 1
 
 239         if self.sub_number is not None:
 
 240             src += '#sub%d' % self.sub_number
 
 243     def write_to_xml(self, nav_map, counter=1):
 
 244         for child in self.children:
 
 245             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 
 246             nav_point.set('id', 'NavPoint-%d' % counter)
 
 247             nav_point.set('playOrder', str(counter))
 
 249             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 
 250             text = nav_map.makeelement(NCXNS('text'))
 
 251             text.text = child.name
 
 252             nav_label.append(text)
 
 253             nav_point.append(nav_label)
 
 255             content = nav_map.makeelement(NCXNS('content'))
 
 256             content.set('src', child.href())
 
 257             nav_point.append(content)
 
 258             nav_map.append(nav_point)
 
 259             counter = child.write_to_xml(nav_point, counter + 1)
 
 262     def html_part(self, depth=0):
 
 264         for child in self.children:
 
 266                 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
 
 267                 (depth, child.href(), child.name))
 
 268             texts.append(child.html_part(depth+1))
 
 269         return "\n".join(texts)
 
 272         with open(get_resource('epub/toc.html')) as f:
 
 273             t = unicode(f.read(), 'utf-8')
 
 274         return t % self.html_part()
 
 277 def used_chars(element):
 
 278     """ Lists characters used in an ETree Element """
 
 279     chars = set((element.text or '') + (element.tail or ''))
 
 280     for child in element:
 
 281         chars = chars.union(used_chars(child))
 
 286     """ divide main content of the XML file into chunks """
 
 288     # prepare a container for each chunk
 
 289     part_xml = etree.Element('utwor')
 
 290     etree.SubElement(part_xml, 'master')
 
 291     main_xml_part = part_xml[0] # master
 
 293     last_node_part = False
 
 294     for one_part in main_text:
 
 296         if name == 'naglowek_czesc':
 
 298             last_node_part = True
 
 299             main_xml_part[:] = [deepcopy(one_part)]
 
 300         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 
 302             main_xml_part[:] = [deepcopy(one_part)]
 
 304             main_xml_part.append(deepcopy(one_part))
 
 305             last_node_part = False
 
 309 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 
 310     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 
 313     for element in chunk_xml[0]:
 
 314         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 
 315             toc.add(node_name(element), "part%d.html" % chunk_no)
 
 316         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 
 317             subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
 
 318             element.set('sub', str(subnumber))
 
 320         if not _empty_html_static:
 
 321             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 
 323         output_html = _empty_html_static[0]
 
 325         find_annotations(annotations, chunk_xml, chunk_no)
 
 326         replace_by_verse(chunk_xml)
 
 327         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 
 328         chars = used_chars(html_tree.getroot())
 
 329         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 
 330     return output_html, toc, chars
 
 333 def flatten_image_paths(wldoc):
 
 334     root = wldoc.edoc.getroot()
 
 335     for node in root.findall(".//ilustr"):
 
 336         node.attrib['src'] = os.path.basename(node.attrib['src'])
 
 339 def render_latex(wldoc, prefix="latex"):
 
 341     Renders <latex>CODE</latex> as images and returns
 
 342     (changed_wldoc, [ (epub_filepath1, latexfragment_object1), ... ]
 
 344     root = wldoc.edoc.getroot()
 
 345     latex_nodes = root.findall(".//latex")
 
 347     for ln in latex_nodes:
 
 348         fragment = LatexFragment(ln.text, resize=40)
 
 349         images.append((os.path.join(prefix, fragment.filename), fragment))
 
 351         ln.text = os.path.join(prefix, fragment.filename)
 
 356 def transform(wldoc, verbose=False,
 
 357               style=None, html_toc=False,
 
 358               sample=None, cover=None, flags=None, resources=None,
 
 359               intro_file=None, cover_file=None):
 
 360     """ produces a EPUB file
 
 362     sample=n: generate sample e-book (with at least n paragraphs)
 
 363     cover: a cover.Cover factory or True for default
 
 364     flags: less-advertising, without-fonts, working-copy
 
 367     def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
 
 368         """ processes one input file and proceeds to its children """
 
 370         replace_characters(wldoc.edoc.getroot())
 
 372         # every input file will have a TOC entry,
 
 373         # pointing to starting chunk
 
 374         toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
 
 377             # write book title page
 
 378             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
 
 379             chars = used_chars(html_tree.getroot())
 
 380             zip.writestr('OPS/title.html',
 
 381                  etree.tostring(html_tree, method="html", pretty_print=True))
 
 382             # add a title page TOC entry
 
 383             toc.add(u"Tytuł", "title.html")
 
 384         elif wldoc.book_info.parts:
 
 385             # write title page for every parent
 
 386             if sample is not None and sample <= 0:
 
 388                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 
 390                 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
 
 391                 chars = used_chars(html_tree.getroot())
 
 392                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 
 393             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 
 394             add_to_manifest(manifest, chunk_counter)
 
 395             add_to_spine(spine, chunk_counter)
 
 398         if len(wldoc.edoc.getroot()) > 1:
 
 399             # rdf before style master
 
 400             main_text = wldoc.edoc.getroot()[1]
 
 402             # rdf in style master
 
 403             main_text = wldoc.edoc.getroot()[0]
 
 404             if main_text.tag == RDFNS('RDF'):
 
 407         flatten_image_paths(wldoc)
 
 409         if main_text is not None:
 
 410             for chunk_xml in chop(main_text):
 
 412                 if sample is not None:
 
 416                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 
 417                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 
 419                 toc.extend(chunk_toc)
 
 420                 chars = chars.union(chunk_chars)
 
 421                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 
 422                 add_to_manifest(manifest, chunk_counter)
 
 423                 add_to_spine(spine, chunk_counter)
 
 426         for child in wldoc.parts():
 
 427             child_toc, chunk_counter, chunk_chars, sample = transform_file(
 
 428                 child, chunk_counter, first=False, sample=sample)
 
 429             toc.append(child_toc)
 
 430             chars = chars.union(chunk_chars)
 
 432         return toc, chunk_counter, chars, sample
 
 435     document = deepcopy(wldoc)
 
 440             document.edoc.getroot().set(flag, 'yes')
 
 443     document.edoc.getroot().set('editors', u', '.join(sorted(
 
 444         editor.readable() for editor in document.editors())))
 
 446     opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
 
 447     manifest = opf.find('.//' + OPFNS('manifest'))
 
 448     guide = opf.find('.//' + OPFNS('guide'))
 
 449     spine = opf.find('.//' + OPFNS('spine'))
 
 451     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
 
 452     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 
 454     # write static elements
 
 455     mime = zipfile.ZipInfo()
 
 456     mime.filename = 'mimetype'
 
 457     mime.compress_type = zipfile.ZIP_STORED
 
 459     zip.writestr(mime, 'application/epub+zip')
 
 460     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 
 461                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 
 462                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 
 463                        'media-type="application/oebps-package+xml" />' \
 
 464                        '</rootfiles></container>')
 
 465     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 
 466     zip.write(get_resource('res/logo.png'), os.path.join('OPS', 'logo.png'))
 
 467     zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
 
 469         style = get_resource('epub/style.css')
 
 470     zip.write(style, os.path.join('OPS', 'style.css'))
 
 472     document, latex_images = render_latex(document)
 
 473     for image in latex_images:
 
 474         zip.write(image[1].path, os.path.join('OPS', image[0]))
 
 478         if os.path.isdir(resources):
 
 479             for dp, dirs, files in os.walk(resources):
 
 481                     fpath  = os.path.join(dp, fname)
 
 482                     if os.path.isfile(fpath):
 
 483                         zip.write(fpath, os.path.join('OPS', fname))
 
 484                         manifest.append(etree.fromstring(
 
 485                                 '<item id="%s" href="%s" media-type="%s" />' % (os.path.splitext(fname)[0], fname, guess_type(fpath)[0])))
 
 488             print "resources path %s is not directory" % resources
 
 493             cover = FutureOfCopyrightCover
 
 495         cover_file = StringIO()
 
 496         c = cover(document.book_info)
 
 498         c_name = 'cover.%s' % c.ext()
 
 499         zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
 
 502         cover_tree = etree.parse(get_resource('epub/cover.html'))
 
 503         cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
 
 504         zip.writestr('OPS/cover.html', etree.tostring(
 
 505                         cover_tree, method="html", pretty_print=True))
 
 508             if document.book_info.cover_by:
 
 509                 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
 
 510             if document.book_info.cover_source:
 
 511                 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
 
 513         manifest.append(etree.fromstring(
 
 514             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 
 515         manifest.append(etree.fromstring(
 
 516             '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
 
 517         spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
 
 518         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 
 519         guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
 
 523     annotations = etree.Element('annotations')
 
 525     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 
 526                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 
 527                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 
 528                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 
 530     nav_map = toc_file[-1]
 
 532     manifest.append(etree.fromstring(
 
 533         '<item id="first" href="first.html" media-type="application/xhtml+xml" />'))
 
 534     spine.append(etree.fromstring(
 
 535         '<itemref idref="first" />'))
 
 536     html_tree = xslt(document.edoc, get_resource('epub/xsltFirst.xsl'))
 
 537 #    chars.update(used_chars(html_tree.getroot()))
 
 538     zip.writestr('OPS/first.html', etree.tostring(
 
 539                         html_tree, method="html", pretty_print=True))
 
 542         manifest.append(etree.fromstring(
 
 543                 '<item id="intro" href="intro.html" media-type="application/xhtml+xml" />'))
 
 544         spine.append(etree.fromstring(
 
 545                 '<itemref idref="intro" />'))
 
 546         zip.writestr('OPS/intro.html', open(intro_file or get_resource('epub/intro.html')).read())
 
 550         manifest.append(etree.fromstring(
 
 551             '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
 
 552         spine.append(etree.fromstring(
 
 553             '<itemref idref="html_toc" />'))
 
 554         guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Table of Contents"/>'))
 
 556     toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
 
 558     toc.add("Informacje redakcyjne", "first.html", index=0)
 
 560     if len(toc.children) < 2:
 
 561         toc.add(u"Początek książki", "part1.html")
 
 563     # Last modifications in container files and EPUB creation
 
 564     if len(annotations) > 0:
 
 565         toc.add("Przypisy", "annotations.html")
 
 566         manifest.append(etree.fromstring(
 
 567             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 
 568         spine.append(etree.fromstring(
 
 569             '<itemref idref="annotations" />'))
 
 570         replace_by_verse(annotations)
 
 571         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 
 572         chars = chars.union(used_chars(html_tree.getroot()))
 
 573         zip.writestr('OPS/annotations.html', etree.tostring(
 
 574                             html_tree, method="html", pretty_print=True))
 
 576     # toc.add("Weprzyj Wolne Lektury", "support.html")
 
 577     # manifest.append(etree.fromstring(
 
 578     #     '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
 
 579     # spine.append(etree.fromstring(
 
 580     #     '<itemref idref="support" />'))
 
 581     # html_string = open(get_resource('epub/support.html')).read()
 
 582     # chars.update(used_chars(etree.fromstring(html_string)))
 
 583     # zip.writestr('OPS/support.html', html_string)
 
 585     toc.add("Informacje redakcyjne", "last.html")
 
 586     manifest.append(etree.fromstring(
 
 587         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 
 588     spine.append(etree.fromstring(
 
 589         '<itemref idref="last" />'))
 
 590     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
 
 591     chars.update(used_chars(html_tree.getroot()))
 
 592     zip.writestr('OPS/last.html', etree.tostring(
 
 593                         html_tree, method="html", pretty_print=True))
 
 595     if not flags or not 'without-fonts' in flags:
 
 597         tmpdir = mkdtemp('-librarian-epub')
 
 603         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 
 604         for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 
 605             optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 
 606                               get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 
 608                 print "Running font-optimizer"
 
 609                 subprocess.check_call(optimizer_call)
 
 611                 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
 612             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 
 613             manifest.append(etree.fromstring(
 
 614                 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
 
 619     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 
 620     title = document.book_info.title
 
 621     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 
 622     for st in attributes:
 
 623         meta = toc_file.makeelement(NCXNS('meta'))
 
 625         meta.set('content', '0')
 
 626         toc_file[0].append(meta)
 
 627     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 
 628     toc_file[0][1].set('content', str(toc.depth()))
 
 629     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 
 633         toc.add(u"Spis treści", "toc.html", index=1)
 
 634         zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
 
 635     toc.write_to_xml(nav_map)
 
 636     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 
 639     return OutputFile.from_filename(output_file.name)