librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import re
  11 import subprocess
  12 from StringIO import StringIO
  13 from copy import deepcopy
  14 from lxml import etree
  15 import zipfile
  16 from tempfile import mkdtemp, NamedTemporaryFile
  17 from shutil import rmtree
  18
  19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
  20 from librarian.cover import DefaultEbookCover
  21
  22 from librarian import functions, get_resource
  23
# Import-time side effect: call two registration hooks from librarian.functions.
# NOTE(review): presumably these register custom XPath/XSLT extension
# functions (person-name formatting, 3-letter to 2-letter language codes)
# that the EPUB stylesheets rely on -- confirm in librarian/functions.py.
functions.reg_person_name()
functions.reg_lang_code_3to2()
  26
  27
  28 def inner_xml(node):
  29     """ returns node's text and children as a string
  30
  31     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  32     x<b>y</b>z
  33     """
  34
  35     nt = node.text if node.text is not None else ''
  36     return ''.join([nt] + [etree.tostring(child) for child in node])
  37
  38 def set_inner_xml(node, text):
  39     """ sets node's text and children from a string
  40
  41     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  42     >>> set_inner_xml(e, 'x<b>y</b>z')
  43     >>> print etree.tostring(e)
  44     <a>x<b>y</b>z</a>
  45     """
  46
  47     p = etree.fromstring('<x>%s</x>' % text)
  48     node.text = p.text
  49     node[:] = p[:]
  50
  51
  52 def node_name(node):
  53     """ Find out a node's name
  54
  55     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  56     XYZ
  57     """
  58
  59     tempnode = deepcopy(node)
  60
  61     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  62         for e in tempnode.findall('.//%s' % p):
  63             t = e.tail
  64             e.clear()
  65             e.tail = t
  66     etree.strip_tags(tempnode, '*')
  67     return tempnode.text
  68
  69
  70 def xslt(xml, sheet):
  71     if isinstance(xml, etree._Element):
  72         xml = etree.ElementTree(xml)
  73     with open(sheet) as xsltf:
  74         return xml.xslt(etree.parse(xsltf))
  75
  76
  77 def replace_characters(node):
  78     def replace_chars(text):
  79         if text is None:
  80             return None
  81         return text.replace(u"\ufeff", u"")\
  82                    .replace("---", u"\u2014")\
  83                    .replace("--", u"\u2013")\
  84                    .replace(",,", u"\u201E")\
  85                    .replace('"', u"\u201D")\
  86                    .replace("'", u"\u2019")
  87     if node.tag in ('uwaga', 'extra'):
  88         t = node.tail
  89         node.clear()
  90         node.tail = t
  91     node.text = replace_chars(node.text)
  92     node.tail = replace_chars(node.tail)
  93     for child in node:
  94         replace_characters(child)
  95
  96
  97 def find_annotations(annotations, source, part_no):
  98     for child in source:
  99         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 100             annotation = deepcopy(child)
 101             number = str(len(annotations)+1)
 102             annotation.set('number', number)
 103             annotation.set('part', str(part_no))
 104             annotation.tail = ''
 105             annotations.append(annotation)
 106             tail = child.tail
 107             child.clear()
 108             child.tail = tail
 109             child.text = number
 110         if child.tag not in ('extra', 'uwaga'):
 111             find_annotations(annotations, child, part_no)
 112
 113
 114 class Stanza(object):
 115     """
 116     Converts / verse endings into verse elements in a stanza.
 117
 118     Slashes may only occur directly in the stanza. Any slashes in subelements
 119     will be ignored, and the subelements will be put inside verse elements.
 120
 121     >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
 122     >>> Stanza(s).versify()
 123     >>> print etree.tostring(s)
 124     <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
 125     y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
 126
 127     """
 128     def __init__(self, stanza_elem):
 129         self.stanza = stanza_elem
 130         self.verses = []
 131         self.open_verse = None
 132
 133     def versify(self):
 134         self.push_text(self.stanza.text)
 135         for elem in self.stanza:
 136             self.push_elem(elem)
 137             self.push_text(elem.tail)
 138         tail = self.stanza.tail
 139         self.stanza.clear()
 140         self.stanza.tail = tail
 141         self.stanza.extend(self.verses)
 142
 143     def open_normal_verse(self):
 144         self.open_verse = self.stanza.makeelement("wers_normalny")
 145         self.verses.append(self.open_verse)
 146
 147     def get_open_verse(self):
 148         if self.open_verse is None:
 149             self.open_normal_verse()
 150         return self.open_verse
 151
 152     def push_text(self, text):
 153         if not text:
 154             return
 155         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
 156             if i:
 157                 self.open_normal_verse()
 158             verse = self.get_open_verse()
 159             if len(verse):
 160                 verse[-1].tail = (verse[-1].tail or "") + verse_text
 161             else:
 162                 verse.text = (verse.text or "") + verse_text
 163
 164     def push_elem(self, elem):
 165         if elem.tag.startswith("wers"):
 166             verse = deepcopy(elem)
 167             verse.tail = None
 168             self.verses.append(verse)
 169             self.open_verse = verse
 170         else:
 171             appended = deepcopy(elem)
 172             appended.tail = None
 173             self.get_open_verse().append(appended)
 174
 175
 176 def replace_by_verse(tree):
 177     """ Find stanzas and create new verses in place of a '/' character """
 178
 179     stanzas = tree.findall('.//' + WLNS('strofa'))
 180     for stanza in stanzas:
 181         Stanza(stanza).versify()
 182
 183
 184 def add_to_manifest(manifest, partno):
 185     """ Adds a node to the manifest section in content.opf file """
 186
 187     partstr = 'part%d' % partno
 188     e = manifest.makeelement(OPFNS('item'), attrib={
 189                                  'id': partstr,
 190                                  'href': partstr + '.html',
 191                                  'media-type': 'application/xhtml+xml',
 192                              })
 193     manifest.append(e)
 194
 195
 196 def add_to_spine(spine, partno):
 197     """ Adds a node to the spine section in content.opf file """
 198
 199     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 200     spine.append(e)
 201
 202
 203 class TOC(object):
 204     def __init__(self, name=None, part_href=None):
 205         self.children = []
 206         self.name = name
 207         self.part_href = part_href
 208         self.sub_number = None
 209
 210     def add(self, name, part_href, level=0, is_part=True, index=None):
 211         assert level == 0 or index is None
 212         if level > 0 and self.children:
 213             return self.children[-1].add(name, part_href, level-1, is_part)
 214         else:
 215             t = TOC(name)
 216             t.part_href = part_href
 217             if index is not None:
 218                 self.children.insert(index, t)
 219             else:
 220                 self.children.append(t)
 221             if not is_part:
 222                 t.sub_number = len(self.children) + 1
 223                 return t.sub_number
 224
 225     def append(self, toc):
 226         self.children.append(toc)
 227
 228     def extend(self, toc):
 229         self.children.extend(toc.children)
 230
 231     def depth(self):
 232         if self.children:
 233             return max((c.depth() for c in self.children)) + 1
 234         else:
 235             return 0
 236
 237     def href(self):
 238         src = self.part_href
 239         if self.sub_number is not None:
 240             src += '#sub%d' % self.sub_number
 241         return src
 242
 243     def write_to_xml(self, nav_map, counter=1):
 244         for child in self.children:
 245             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 246             nav_point.set('id', 'NavPoint-%d' % counter)
 247             nav_point.set('playOrder', str(counter))
 248
 249             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 250             text = nav_map.makeelement(NCXNS('text'))
 251             text.text = child.name
 252             nav_label.append(text)
 253             nav_point.append(nav_label)
 254
 255             content = nav_map.makeelement(NCXNS('content'))
 256             content.set('src', child.href())
 257             nav_point.append(content)
 258             nav_map.append(nav_point)
 259             counter = child.write_to_xml(nav_point, counter + 1)
 260         return counter
 261
 262     def html_part(self, depth=0):
 263         texts = []
 264         for child in self.children:
 265             texts.append(
 266                 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
 267                 (depth, child.href(), child.name))
 268             texts.append(child.html_part(depth+1))
 269         return "\n".join(texts)
 270
 271     def html(self):
 272         with open(get_resource('epub/toc.html')) as f:
 273             t = unicode(f.read(), 'utf-8')
 274         return t % self.html_part()
 275
 276
 277 def used_chars(element):
 278     """ Lists characters used in an ETree Element """
 279     chars = set((element.text or '') + (element.tail or ''))
 280     for child in element:
 281         chars = chars.union(used_chars(child))
 282     return chars
 283
 284
 285 def chop(main_text):
 286     """ divide main content of the XML file into chunks """
 287
 288     # prepare a container for each chunk
 289     part_xml = etree.Element('utwor')
 290     etree.SubElement(part_xml, 'master')
 291     main_xml_part = part_xml[0] # master
 292
 293     last_node_part = False
 294     for one_part in main_text:
 295         name = one_part.tag
 296         if name == 'naglowek_czesc':
 297             yield part_xml
 298             last_node_part = True
 299             main_xml_part[:] = [deepcopy(one_part)]
 300         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 301             yield part_xml
 302             main_xml_part[:] = [deepcopy(one_part)]
 303         else:
 304             main_xml_part.append(deepcopy(one_part))
 305             last_node_part = False
 306     yield part_xml
 307
 308
 309 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 310     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 311
 312     toc = TOC()
 313     for element in chunk_xml[0]:
 314         if element.tag in ("naglowek_czesc"):
 315             toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
 316         elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 317             toc.add(node_name(element), "part%d.html" % chunk_no)
 318         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 319             subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
 320             element.set('sub', str(subnumber))
 321     if empty:
 322         if not _empty_html_static:
 323             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 324         chars = set()
 325         output_html = _empty_html_static[0]
 326     else:
 327         find_annotations(annotations, chunk_xml, chunk_no)
 328         replace_by_verse(chunk_xml)
 329         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 330         chars = used_chars(html_tree.getroot())
 331         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 332     return output_html, toc, chars
 333
 334
 335 def transform(wldoc, verbose=False,
 336               style=None, html_toc=False,
 337               sample=None, cover=None, flags=None):
 338     """ produces a EPUB file
 339
 340     sample=n: generate sample e-book (with at least n paragraphs)
 341     cover: a cover.Cover factory or True for default
 342     flags: less-advertising, without-fonts, working-copy, with-full-fonts
 343     """
 344
 345     def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
 346         """ processes one input file and proceeds to its children """
 347
 348         replace_characters(wldoc.edoc.getroot())
 349
 350         # every input file will have a TOC entry,
 351         # pointing to starting chunk
 352         toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
 353         chars = set()
 354         if first:
 355             # write book title page
 356             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
 357             chars = used_chars(html_tree.getroot())
 358             zip.writestr('OPS/title.html',
 359                  etree.tostring(html_tree, method="html", pretty_print=True))
 360             # add a title page TOC entry
 361             toc.add(u"Strona tytułowa", "title.html")
 362         elif wldoc.book_info.parts:
 363             # write title page for every parent
 364             if sample is not None and sample <= 0:
 365                 chars = set()
 366                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 367             else:
 368                 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
 369                 chars = used_chars(html_tree.getroot())
 370                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 371             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 372             add_to_manifest(manifest, chunk_counter)
 373             add_to_spine(spine, chunk_counter)
 374             chunk_counter += 1
 375
 376         if len(wldoc.edoc.getroot()) > 1:
 377             # rdf before style master
 378             main_text = wldoc.edoc.getroot()[1]
 379         else:
 380             # rdf in style master
 381             main_text = wldoc.edoc.getroot()[0]
 382             if main_text.tag == RDFNS('RDF'):
 383                 main_text = None
 384
 385         if main_text is not None:
 386             for chunk_xml in chop(main_text):
 387                 empty = False
 388                 if sample is not None:
 389                     if sample <= 0:
 390                         empty = True
 391                     else:
 392                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 393                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 394
 395                 toc.extend(chunk_toc)
 396                 chars = chars.union(chunk_chars)
 397                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 398                 add_to_manifest(manifest, chunk_counter)
 399                 add_to_spine(spine, chunk_counter)
 400                 chunk_counter += 1
 401
 402         for child in wldoc.parts():
 403             child_toc, chunk_counter, chunk_chars, sample = transform_file(
 404                 child, chunk_counter, first=False, sample=sample)
 405             toc.append(child_toc)
 406             chars = chars.union(chunk_chars)
 407
 408         return toc, chunk_counter, chars, sample
 409
 410
 411     document = deepcopy(wldoc)
 412     del wldoc
 413
 414     if flags:
 415         for flag in flags:
 416             document.edoc.getroot().set(flag, 'yes')
 417
 418     # add editors info
 419     document.edoc.getroot().set('editors', u', '.join(sorted(
 420         editor.readable() for editor in document.editors())))
 421     if document.book_info.funders:
 422         document.edoc.getroot().set('funders', u', '.join(
 423             document.book_info.funders))
 424     if document.book_info.thanks:
 425         document.edoc.getroot().set('thanks', document.book_info.thanks)
 426
 427     opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
 428     manifest = opf.find('.//' + OPFNS('manifest'))
 429     guide = opf.find('.//' + OPFNS('guide'))
 430     spine = opf.find('.//' + OPFNS('spine'))
 431
 432     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
 433     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 434
 435     # write static elements
 436     mime = zipfile.ZipInfo()
 437     mime.filename = 'mimetype'
 438     mime.compress_type = zipfile.ZIP_STORED
 439     mime.extra = ''
 440     zip.writestr(mime, 'application/epub+zip')
 441     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 442                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 443                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 444                        'media-type="application/oebps-package+xml" />' \
 445                        '</rootfiles></container>')
 446     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 447     zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
 448     if not style:
 449         style = get_resource('epub/style.css')
 450     zip.write(style, os.path.join('OPS', 'style.css'))
 451
 452     if cover:
 453         if cover is True:
 454             cover = DefaultEbookCover
 455
 456         cover_file = StringIO()
 457         bound_cover = cover(document.book_info)
 458         bound_cover.save(cover_file)
 459         cover_name = 'cover.%s' % bound_cover.ext()
 460         zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
 461         del cover_file
 462
 463         cover_tree = etree.parse(get_resource('epub/cover.html'))
 464         cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
 465         zip.writestr('OPS/cover.html', etree.tostring(
 466                         cover_tree, method="html", pretty_print=True))
 467
 468         if bound_cover.uses_dc_cover:
 469             if document.book_info.cover_by:
 470                 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
 471             if document.book_info.cover_source:
 472                 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
 473
 474         manifest.append(etree.fromstring(
 475             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 476         manifest.append(etree.fromstring(
 477             '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
 478         spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
 479         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 480         guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
 481
 482
 483     annotations = etree.Element('annotations')
 484
 485     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 486                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 487                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 488                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 489                                '</navMap></ncx>')
 490     nav_map = toc_file[-1]
 491
 492     if html_toc:
 493         manifest.append(etree.fromstring(
 494             '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
 495         spine.append(etree.fromstring(
 496             '<itemref idref="html_toc" />'))
 497         guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
 498
 499     toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
 500
 501     if len(toc.children) < 2:
 502         toc.add(u"Początek utworu", "part1.html")
 503
 504     # Last modifications in container files and EPUB creation
 505     if len(annotations) > 0:
 506         toc.add("Przypisy", "annotations.html")
 507         manifest.append(etree.fromstring(
 508             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 509         spine.append(etree.fromstring(
 510             '<itemref idref="annotations" />'))
 511         replace_by_verse(annotations)
 512         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 513         chars = chars.union(used_chars(html_tree.getroot()))
 514         zip.writestr('OPS/annotations.html', etree.tostring(
 515                             html_tree, method="html", pretty_print=True))
 516
 517     toc.add("Wesprzyj Wolne Lektury", "support.html")
 518     manifest.append(etree.fromstring(
 519         '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
 520     spine.append(etree.fromstring(
 521         '<itemref idref="support" />'))
 522     html_string = open(get_resource('epub/support.html')).read()
 523     chars.update(used_chars(etree.fromstring(html_string)))
 524     zip.writestr('OPS/support.html', html_string)
 525
 526     toc.add("Strona redakcyjna", "last.html")
 527     manifest.append(etree.fromstring(
 528         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 529     spine.append(etree.fromstring(
 530         '<itemref idref="last" />'))
 531     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
 532     chars.update(used_chars(html_tree.getroot()))
 533     zip.writestr('OPS/last.html', etree.tostring(
 534                         html_tree, method="html", pretty_print=True))
 535
 536     if not flags or not 'without-fonts' in flags:
 537         # strip fonts
 538         tmpdir = mkdtemp('-librarian-epub')
 539         try:
 540             cwd = os.getcwd()
 541         except OSError:
 542             cwd = None
 543
 544         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 545         for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 546             if not flags or not 'with-full-fonts' in flags:
 547                 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 548                               get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 549                 if verbose:
 550                     print "Running font-optimizer"
 551                     subprocess.check_call(optimizer_call)
 552                 else:
 553                     subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 554                     zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 555             else:
 556                 zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
 557             manifest.append(etree.fromstring(
 558                 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
 559         rmtree(tmpdir)
 560         if cwd is not None:
 561             os.chdir(cwd)
 562     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
 563     title = document.book_info.title
 564     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 565     for st in attributes:
 566         meta = toc_file.makeelement(NCXNS('meta'))
 567         meta.set('name', st)
 568         meta.set('content', '0')
 569         toc_file[0].append(meta)
 570     toc_file[0][0].set('content', str(document.book_info.url))
 571     toc_file[0][1].set('content', str(toc.depth()))
 572     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 573
 574     # write TOC
 575     if html_toc:
 576         toc.add(u"Spis treści", "toc.html", index=1)
 577         zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
 578     toc.write_to_xml(nav_map)
 579     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
 580     zip.close()
 581
 582     return OutputFile.from_filename(output_file.name)