librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import re
  11 import subprocess
  12 from StringIO import StringIO
  13 from copy import deepcopy
  14 from lxml import etree
  15 import zipfile
  16 from tempfile import mkdtemp, NamedTemporaryFile
  17 from shutil import rmtree
  18
  19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
  20 from librarian.cover import WLCover, FutureOfCopyrightCover
  21
  22 from librarian import functions, get_resource
  23
  24 functions.reg_person_name()
  25
  26
  27 def inner_xml(node):
  28     """ returns node's text and children as a string
  29
  30     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  31     x<b>y</b>z
  32     """
  33
  34     nt = node.text if node.text is not None else ''
  35     return ''.join([nt] + [etree.tostring(child) for child in node])
  36
  37 def set_inner_xml(node, text):
  38     """ sets node's text and children from a string
  39
  40     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  41     >>> set_inner_xml(e, 'x<b>y</b>z')
  42     >>> print etree.tostring(e)
  43     <a>x<b>y</b>z</a>
  44     """
  45
  46     p = etree.fromstring('<x>%s</x>' % text)
  47     node.text = p.text
  48     node[:] = p[:]
  49
  50
  51 def node_name(node):
  52     """ Find out a node's name
  53
  54     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  55     XYZ
  56     """
  57
  58     tempnode = deepcopy(node)
  59
  60     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  61         for e in tempnode.findall('.//%s' % p):
  62             t = e.tail
  63             e.clear()
  64             e.tail = t
  65     etree.strip_tags(tempnode, '*')
  66     return tempnode.text
  67
  68
  69 def xslt(xml, sheet):
  70     if isinstance(xml, etree._Element):
  71         xml = etree.ElementTree(xml)
  72     with open(sheet) as xsltf:
  73         return xml.xslt(etree.parse(xsltf))
  74
  75
  76 def replace_characters(node):
  77     def replace_chars(text):
  78         if text is None:
  79             return None
  80         return text.replace(u"\ufeff", u"")\
  81                    .replace("---", u"\u2014")\
  82                    .replace("--", u"\u2013")\
  83                    .replace(",,", u"\u201E")\
  84                    .replace('"', u"\u201D")\
  85                    .replace("'", u"\u2019")
  86     if node.tag in ('uwaga', 'extra'):
  87         t = node.tail
  88         node.clear()
  89         node.tail = t
  90     node.text = replace_chars(node.text)
  91     node.tail = replace_chars(node.tail)
  92     for child in node:
  93         replace_characters(child)
  94
  95
  96 def find_annotations(annotations, source, part_no):
  97     for child in source:
  98         if child.tag in ('pe', 'pa', 'pt', 'pr'):
  99             annotation = deepcopy(child)
 100             number = str(len(annotations)+1)
 101             annotation.set('number', number)
 102             annotation.set('part', str(part_no))
 103             annotation.tail = ''
 104             annotations.append(annotation)
 105             tail = child.tail
 106             child.clear()
 107             child.tail = tail
 108             child.text = number
 109         if child.tag not in ('extra', 'uwaga'):
 110             find_annotations(annotations, child, part_no)
 111
 112
 113 class Stanza(object):
 114     """
 115     Converts / verse endings into verse elements in a stanza.
 116
 117     Slashes may only occur directly in the stanza. Any slashes in subelements
 118     will be ignored, and the subelements will be put inside verse elements.
 119
 120     >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
 121     >>> Stanza(s).versify()
 122     >>> print etree.tostring(s)
 123     <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
 124     y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
 125
 126     """
 127     def __init__(self, stanza_elem):
 128         self.stanza = stanza_elem
 129         self.verses = []
 130         self.open_verse = None
 131
 132     def versify(self):
 133         self.push_text(self.stanza.text)
 134         for elem in self.stanza:
 135             self.push_elem(elem)
 136             self.push_text(elem.tail)
 137         tail = self.stanza.tail
 138         self.stanza.clear()
 139         self.stanza.tail = tail
 140         self.stanza.extend(self.verses)
 141
 142     def open_normal_verse(self):
 143         self.open_verse = self.stanza.makeelement("wers_normalny")
 144         self.verses.append(self.open_verse)
 145
 146     def get_open_verse(self):
 147         if self.open_verse is None:
 148             self.open_normal_verse()
 149         return self.open_verse
 150
 151     def push_text(self, text):
 152         if not text:
 153             return
 154         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
 155             if i:
 156                 self.open_normal_verse()
 157             verse = self.get_open_verse()
 158             if len(verse):
 159                 verse[-1].tail = (verse[-1].tail or "") + verse_text
 160             else:
 161                 verse.text = (verse.text or "") + verse_text
 162
 163     def push_elem(self, elem):
 164         if elem.tag.startswith("wers"):
 165             verse = deepcopy(elem)
 166             verse.tail = None
 167             self.verses.append(verse)
 168             self.open_verse = verse
 169         else:
 170             appended = deepcopy(elem)
 171             appended.tail = None
 172             self.get_open_verse().append(appended)
 173
 174
 175 def replace_by_verse(tree):
 176     """ Find stanzas and create new verses in place of a '/' character """
 177
 178     stanzas = tree.findall('.//' + WLNS('strofa'))
 179     for stanza in stanzas:
 180         Stanza(stanza).versify()
 181
 182
 183 def add_to_manifest(manifest, partno):
 184     """ Adds a node to the manifest section in content.opf file """
 185
 186     partstr = 'part%d' % partno
 187     e = manifest.makeelement(OPFNS('item'), attrib={
 188                                  'id': partstr,
 189                                  'href': partstr + '.html',
 190                                  'media-type': 'application/xhtml+xml',
 191                              })
 192     manifest.append(e)
 193
 194
 195 def add_to_spine(spine, partno):
 196     """ Adds a node to the spine section in content.opf file """
 197
 198     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 199     spine.append(e)
 200
 201
 202 class TOC(object):
 203     def __init__(self, name=None, part_href=None):
 204         self.children = []
 205         self.name = name
 206         self.part_href = part_href
 207         self.sub_number = None
 208
 209     def add(self, name, part_href, level=0, is_part=True, index=None):
 210         assert level == 0 or index is None
 211         if level > 0 and self.children:
 212             return self.children[-1].add(name, part_href, level-1, is_part)
 213         else:
 214             t = TOC(name)
 215             t.part_href = part_href
 216             if index is not None:
 217                 self.children.insert(index, t)
 218             else:
 219                 self.children.append(t)
 220             if not is_part:
 221                 t.sub_number = len(self.children) + 1
 222                 return t.sub_number
 223
 224     def append(self, toc):
 225         self.children.append(toc)
 226
 227     def extend(self, toc):
 228         self.children.extend(toc.children)
 229
 230     def depth(self):
 231         if self.children:
 232             return max((c.depth() for c in self.children)) + 1
 233         else:
 234             return 0
 235
 236     def href(self):
 237         src = self.part_href
 238         if self.sub_number is not None:
 239             src += '#sub%d' % self.sub_number
 240         return src
 241
 242     def write_to_xml(self, nav_map, counter=1):
 243         for child in self.children:
 244             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 245             nav_point.set('id', 'NavPoint-%d' % counter)
 246             nav_point.set('playOrder', str(counter))
 247
 248             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 249             text = nav_map.makeelement(NCXNS('text'))
 250             text.text = child.name
 251             nav_label.append(text)
 252             nav_point.append(nav_label)
 253
 254             content = nav_map.makeelement(NCXNS('content'))
 255             content.set('src', child.href())
 256             nav_point.append(content)
 257             nav_map.append(nav_point)
 258             counter = child.write_to_xml(nav_point, counter + 1)
 259         return counter
 260
 261     def html_part(self, depth=0):
 262         texts = []
 263         for child in self.children:
 264             texts.append(
 265                 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
 266                 (depth, child.href(), child.name))
 267             texts.append(child.html_part(depth+1))
 268         return "\n".join(texts)
 269
 270     def html(self):
 271         with open(get_resource('epub/toc.html')) as f:
 272             t = unicode(f.read(), 'utf-8')
 273         return t % self.html_part()
 274
 275
 276 def used_chars(element):
 277     """ Lists characters used in an ETree Element """
 278     chars = set((element.text or '') + (element.tail or ''))
 279     for child in element:
 280         chars = chars.union(used_chars(child))
 281     return chars
 282
 283
 284 def chop(main_text):
 285     """ divide main content of the XML file into chunks """
 286
 287     # prepare a container for each chunk
 288     part_xml = etree.Element('utwor')
 289     etree.SubElement(part_xml, 'master')
 290     main_xml_part = part_xml[0] # master
 291
 292     last_node_part = False
 293     for one_part in main_text:
 294         name = one_part.tag
 295         if name == 'naglowek_czesc':
 296             yield part_xml
 297             last_node_part = True
 298             main_xml_part[:] = [deepcopy(one_part)]
 299         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 300             yield part_xml
 301             main_xml_part[:] = [deepcopy(one_part)]
 302         else:
 303             main_xml_part.append(deepcopy(one_part))
 304             last_node_part = False
 305     yield part_xml
 306
 307
 308 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 309     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 310
 311     toc = TOC()
 312     for element in chunk_xml[0]:
 313         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 314             toc.add(node_name(element), "part%d.html" % chunk_no)
 315         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 316             subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
 317             element.set('sub', str(subnumber))
 318     if empty:
 319         if not _empty_html_static:
 320             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 321         chars = set()
 322         output_html = _empty_html_static[0]
 323     else:
 324         find_annotations(annotations, chunk_xml, chunk_no)
 325         replace_by_verse(chunk_xml)
 326         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 327         chars = used_chars(html_tree.getroot())
 328         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 329     return output_html, toc, chars
 330
 331
 332 def transform(wldoc, verbose=False,
 333               style=None, html_toc=False,
 334               sample=None, cover=None, flags=None, resources=None):
 335     """ produces a EPUB file
 336
 337     sample=n: generate sample e-book (with at least n paragraphs)
 338     cover: a cover.Cover factory or True for default
 339     flags: less-advertising, without-fonts, working-copy
 340     """
 341
 342     def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
 343         """ processes one input file and proceeds to its children """
 344
 345         replace_characters(wldoc.edoc.getroot())
 346
 347         # every input file will have a TOC entry,
 348         # pointing to starting chunk
 349         toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
 350         chars = set()
 351         if first:
 352             # write book title page
 353             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
 354             chars = used_chars(html_tree.getroot())
 355             zip.writestr('OPS/title.html',
 356                  etree.tostring(html_tree, method="html", pretty_print=True))
 357             # add a title page TOC entry
 358             toc.add(u"Title", "title.html")
 359         elif wldoc.book_info.parts:
 360             # write title page for every parent
 361             if sample is not None and sample <= 0:
 362                 chars = set()
 363                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 364             else:
 365                 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
 366                 chars = used_chars(html_tree.getroot())
 367                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 368             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 369             add_to_manifest(manifest, chunk_counter)
 370             add_to_spine(spine, chunk_counter)
 371             chunk_counter += 1
 372
 373         if len(wldoc.edoc.getroot()) > 1:
 374             # rdf before style master
 375             main_text = wldoc.edoc.getroot()[1]
 376         else:
 377             # rdf in style master
 378             main_text = wldoc.edoc.getroot()[0]
 379             if main_text.tag == RDFNS('RDF'):
 380                 main_text = None
 381
 382         if main_text is not None:
 383             for chunk_xml in chop(main_text):
 384                 empty = False
 385                 if sample is not None:
 386                     if sample <= 0:
 387                         empty = True
 388                     else:
 389                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 390                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 391
 392                 toc.extend(chunk_toc)
 393                 chars = chars.union(chunk_chars)
 394                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 395                 add_to_manifest(manifest, chunk_counter)
 396                 add_to_spine(spine, chunk_counter)
 397                 chunk_counter += 1
 398
 399         for child in wldoc.parts():
 400             child_toc, chunk_counter, chunk_chars, sample = transform_file(
 401                 child, chunk_counter, first=False, sample=sample)
 402             toc.append(child_toc)
 403             chars = chars.union(chunk_chars)
 404
 405         return toc, chunk_counter, chars, sample
 406
 407
 408     document = deepcopy(wldoc)
 409     del wldoc
 410
 411     if flags:
 412         for flag in flags:
 413             document.edoc.getroot().set(flag, 'yes')
 414
 415     # add editors info
 416     document.edoc.getroot().set('editors', u', '.join(sorted(
 417         editor.readable() for editor in document.editors())))
 418
 419     opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
 420     manifest = opf.find('.//' + OPFNS('manifest'))
 421     guide = opf.find('.//' + OPFNS('guide'))
 422     spine = opf.find('.//' + OPFNS('spine'))
 423
 424     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
 425     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 426
 427     # write static elements
 428     mime = zipfile.ZipInfo()
 429     mime.filename = 'mimetype'
 430     mime.compress_type = zipfile.ZIP_STORED
 431     mime.extra = ''
 432     zip.writestr(mime, 'application/epub+zip')
 433     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 434                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 435                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 436                        'media-type="application/oebps-package+xml" />' \
 437                        '</rootfiles></container>')
 438     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 439     zip.write(get_resource('res/logo.png'), os.path.join('OPS', 'logo.png'))
 440     zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
 441     if not style:
 442         style = get_resource('epub/style.css')
 443     zip.write(style, os.path.join('OPS', 'style.css'))
 444     if resources:
 445         if os.path.isdir(resources):
 446             for dp, dirs, files in os.walk(resources):
 447                 for fname in files:
 448                     fpath  = os.path.join(dp, fname)
 449                     if os.path.isfile(fpath):
 450                         zip.write(fpath, os.path.join('OPS', fname))
 451         else:
 452             print "resources path %s is not directory" % resources
 453
 454
 455     if cover:
 456         if cover is True:
 457             cover = FutureOfCopyrightCover
 458
 459         cover_file = StringIO()
 460         c = cover(document.book_info)
 461         c.save(cover_file)
 462         c_name = 'cover.%s' % c.ext()
 463         zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
 464         del cover_file
 465
 466         cover_tree = etree.parse(get_resource('epub/cover.html'))
 467         cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
 468         zip.writestr('OPS/cover.html', etree.tostring(
 469                         cover_tree, method="html", pretty_print=True))
 470
 471         if c.uses_dc_cover:
 472             if document.book_info.cover_by:
 473                 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
 474             if document.book_info.cover_source:
 475                 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
 476
 477         manifest.append(etree.fromstring(
 478             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 479         manifest.append(etree.fromstring(
 480             '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
 481         spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
 482         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 483         guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
 484
 485
 486
 487     annotations = etree.Element('annotations')
 488
 489     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 490                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 491                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 492                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 493                                '</navMap></ncx>')
 494     nav_map = toc_file[-1]
 495
 496     manifest.append(etree.fromstring(
 497         '<item id="intro" href="intro.html" media-type="application/xhtml+xml" />'))
 498     spine.append(etree.fromstring(
 499         '<itemref idref="intro" />'))
 500     zip.writestr('OPS/intro.html', open(get_resource('epub/intro.html')).read())
 501
 502
 503     if html_toc:
 504         manifest.append(etree.fromstring(
 505             '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
 506         spine.append(etree.fromstring(
 507             '<itemref idref="html_toc" />'))
 508         guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Table of Contents"/>'))
 509
 510     toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
 511
 512     if len(toc.children) < 2:
 513         toc.add(u"Beginning of the book", "part1.html")
 514
 515     # Last modifications in container files and EPUB creation
 516     if len(annotations) > 0:
 517         toc.add("Footnotes", "annotations.html")
 518         manifest.append(etree.fromstring(
 519             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 520         spine.append(etree.fromstring(
 521             '<itemref idref="annotations" />'))
 522         replace_by_verse(annotations)
 523         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 524         chars = chars.union(used_chars(html_tree.getroot()))
 525         zip.writestr('OPS/annotations.html', etree.tostring(
 526                             html_tree, method="html", pretty_print=True))
 527
 528     # toc.add("Weprzyj Wolne Lektury", "support.html")
 529     # manifest.append(etree.fromstring(
 530     #     '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
 531     # spine.append(etree.fromstring(
 532     #     '<itemref idref="support" />'))
 533     # html_string = open(get_resource('epub/support.html')).read()
 534     # chars.update(used_chars(etree.fromstring(html_string)))
 535     # zip.writestr('OPS/support.html', html_string)
 536
 537     toc.add("Editors", "last.html")
 538     manifest.append(etree.fromstring(
 539         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 540     spine.append(etree.fromstring(
 541         '<itemref idref="last" />'))
 542     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
 543     chars.update(used_chars(html_tree.getroot()))
 544     zip.writestr('OPS/last.html', etree.tostring(
 545                         html_tree, method="html", pretty_print=True))
 546
 547     if not flags or not 'without-fonts' in flags:
 548         # strip fonts
 549         tmpdir = mkdtemp('-librarian-epub')
 550         try:
 551             cwd = os.getcwd()
 552         except OSError:
 553             cwd = None
 554
 555         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 556         for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 557             optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 558                               get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 559             if verbose:
 560                 print "Running font-optimizer"
 561                 subprocess.check_call(optimizer_call)
 562             else:
 563                 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 564             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 565             manifest.append(etree.fromstring(
 566                 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
 567         rmtree(tmpdir)
 568         if cwd is not None:
 569             os.chdir(cwd)
 570
 571     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 572     title = document.book_info.title
 573     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 574     for st in attributes:
 575         meta = toc_file.makeelement(NCXNS('meta'))
 576         meta.set('name', st)
 577         meta.set('content', '0')
 578         toc_file[0].append(meta)
 579     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 580     toc_file[0][1].set('content', str(toc.depth()))
 581     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 582
 583     # write TOC
 584     if html_toc:
 585         toc.add(u"Table of Contents", "toc.html", index=1)
 586         zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
 587     toc.write_to_xml(nav_map)
 588     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 589     zip.close()
 590
 591     return OutputFile.from_filename(output_file.name)