librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import re
  11 import subprocess
  12 from StringIO import StringIO
  13 from copy import deepcopy
  14 from lxml import etree
  15 import zipfile
  16 from tempfile import mkdtemp, NamedTemporaryFile
  17 from shutil import rmtree
  18
  19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, IOFile
  20
  21 from librarian import functions, get_resource
  22
  23 functions.reg_person_name()
  24
  25
  26 def inner_xml(node):
  27     """ returns node's text and children as a string
  28
  29     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  30     x<b>y</b>z
  31     """
  32
  33     nt = node.text if node.text is not None else ''
  34     return ''.join([nt] + [etree.tostring(child) for child in node])
  35
  36 def set_inner_xml(node, text):
  37     """ sets node's text and children from a string
  38
  39     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  40     >>> set_inner_xml(e, 'x<b>y</b>z')
  41     >>> print etree.tostring(e)
  42     <a>x<b>y</b>z</a>
  43     """
  44
  45     p = etree.fromstring('<x>%s</x>' % text)
  46     node.text = p.text
  47     node[:] = p[:]
  48
  49
  50 def node_name(node):
  51     """ Find out a node's name
  52
  53     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  54     XYZ
  55     """
  56
  57     tempnode = deepcopy(node)
  58
  59     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  60         for e in tempnode.findall('.//%s' % p):
  61             t = e.tail
  62             e.clear()
  63             e.tail = t
  64     etree.strip_tags(tempnode, '*')
  65     return tempnode.text
  66
  67
  68 def xslt(xml, sheet):
  69     if isinstance(xml, etree._Element):
  70         xml = etree.ElementTree(xml)
  71     with open(sheet) as xsltf:
  72         return xml.xslt(etree.parse(xsltf))
  73
  74
  75 def replace_characters(node):
  76     def replace_chars(text):
  77         if text is None:
  78             return None
  79         return text.replace(u"\ufeff", u"")\
  80                    .replace("---", u"\u2014")\
  81                    .replace("--", u"\u2013")\
  82                    .replace(",,", u"\u201E")\
  83                    .replace('"', u"\u201D")\
  84                    .replace("'", u"\u2019")
  85     if node.tag in ('uwaga', 'extra'):
  86         t = node.tail
  87         node.clear()
  88         node.tail = t
  89     node.text = replace_chars(node.text)
  90     node.tail = replace_chars(node.tail)
  91     for child in node:
  92         replace_characters(child)
  93
  94
  95 def find_annotations(annotations, source, part_no):
  96     for child in source:
  97         if child.tag in ('pe', 'pa', 'pt', 'pr'):
  98             annotation = deepcopy(child)
  99             number = str(len(annotations)+1)
 100             annotation.set('number', number)
 101             annotation.set('part', str(part_no))
 102             annotation.tail = ''
 103             annotations.append(annotation)
 104             tail = child.tail
 105             child.clear()
 106             child.tail = tail
 107             child.text = number
 108         if child.tag not in ('extra', 'uwaga'):
 109             find_annotations(annotations, child, part_no)
 110
 111
 112 class Stanza(object):
 113     """
 114     Converts / verse endings into verse elements in a stanza.
 115
 116     Slashes may only occur directly in the stanza. Any slashes in subelements
 117     will be ignored, and the subelements will be put inside verse elements.
 118
 119     >>> s = etree.fromstring("<strofa>a/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
 120     >>> Stanza(s).versify()
 121     >>> print etree.tostring(s)
 122     <strofa><wers_normalny>a</wers_normalny><wers_normalny>b<x>x/
 123     y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
 124
 125     """
 126     def __init__(self, stanza_elem):
 127         self.stanza = stanza_elem
 128         self.verses = []
 129         self.open_verse = None
 130
 131     def versify(self):
 132         self.push_text(self.stanza.text)
 133         for elem in self.stanza:
 134             self.push_elem(elem)
 135             self.push_text(elem.tail)
 136         tail = self.stanza.tail
 137         self.stanza.clear()
 138         self.stanza.tail = tail
 139         self.stanza.extend(self.verses)
 140
 141     def open_normal_verse(self):
 142         self.open_verse = self.stanza.makeelement("wers_normalny")
 143         self.verses.append(self.open_verse)
 144
 145     def get_open_verse(self):
 146         if self.open_verse is None:
 147             self.open_normal_verse()
 148         return self.open_verse
 149
 150     def push_text(self, text):
 151         if not text or not text.strip():
 152             return
 153         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
 154             if i:
 155                 self.open_normal_verse()
 156             verse = self.get_open_verse()
 157             if len(verse):
 158                 verse[-1].tail = (verse[-1].tail or "") + verse_text.strip()
 159             else:
 160                 verse.text = (verse.text or "") + verse_text.strip()
 161
 162     def push_elem(self, elem):
 163         if elem.tag.startswith("wers"):
 164             verse = deepcopy(elem)
 165             verse.tail = None
 166             self.verses.append(verse)
 167             self.open_verse = verse
 168         else:
 169             appended = deepcopy(elem)
 170             appended.tail = None
 171             self.get_open_verse().append(appended)
 172
 173
 174 def replace_by_verse(tree):
 175     """ Find stanzas and create new verses in place of a '/' character """
 176
 177     stanzas = tree.findall('.//' + WLNS('strofa'))
 178     for stanza in stanzas:
 179         Stanza(stanza).versify()
 180
 181
 182 def add_to_manifest(manifest, partno):
 183     """ Adds a node to the manifest section in content.opf file """
 184
 185     partstr = 'part%d' % partno
 186     e = manifest.makeelement(OPFNS('item'), attrib={
 187                                  'id': partstr,
 188                                  'href': partstr + '.html',
 189                                  'media-type': 'application/xhtml+xml',
 190                              })
 191     manifest.append(e)
 192
 193
 194 def add_to_spine(spine, partno):
 195     """ Adds a node to the spine section in content.opf file """
 196
 197     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 198     spine.append(e)
 199
 200
 201 class TOC(object):
 202     def __init__(self, name=None, part_href=None):
 203         self.children = []
 204         self.name = name
 205         self.part_href = part_href
 206         self.sub_number = None
 207
 208     def add(self, name, part_href, level=0, is_part=True, index=None):
 209         assert level == 0 or index is None
 210         if level > 0 and self.children:
 211             return self.children[-1].add(name, part_href, level-1, is_part)
 212         else:
 213             t = TOC(name)
 214             t.part_href = part_href
 215             if index is not None:
 216                 self.children.insert(index, t)
 217             else:
 218                 self.children.append(t)
 219             if not is_part:
 220                 t.sub_number = len(self.children) + 1
 221                 return t.sub_number
 222
 223     def append(self, toc):
 224         self.children.append(toc)
 225
 226     def extend(self, toc):
 227         self.children.extend(toc.children)
 228
 229     def depth(self):
 230         if self.children:
 231             return max((c.depth() for c in self.children)) + 1
 232         else:
 233             return 0
 234
 235     def href(self):
 236         src = self.part_href
 237         if self.sub_number is not None:
 238             src += '#sub%d' % self.sub_number
 239         return src
 240
 241     def write_to_xml(self, nav_map, counter=1):
 242         for child in self.children:
 243             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 244             nav_point.set('id', 'NavPoint-%d' % counter)
 245             nav_point.set('playOrder', str(counter))
 246
 247             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 248             text = nav_map.makeelement(NCXNS('text'))
 249             text.text = child.name
 250             nav_label.append(text)
 251             nav_point.append(nav_label)
 252
 253             content = nav_map.makeelement(NCXNS('content'))
 254             content.set('src', child.href())
 255             nav_point.append(content)
 256             nav_map.append(nav_point)
 257             counter = child.write_to_xml(nav_point, counter + 1)
 258         return counter
 259
 260     def html_part(self, depth=0):
 261         texts = []
 262         for child in self.children:
 263             texts.append(
 264                 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
 265                 (depth, child.href(), child.name))
 266             texts.append(child.html_part(depth+1))
 267         return "\n".join(texts)
 268
 269     def html(self):
 270         with open(get_resource('epub/toc.html')) as f:
 271             t = unicode(f.read(), 'utf-8')
 272         return t % self.html_part()
 273
 274
 275 def used_chars(element):
 276     """ Lists characters used in an ETree Element """
 277     chars = set((element.text or '') + (element.tail or ''))
 278     for child in element:
 279         chars = chars.union(used_chars(child))
 280     return chars
 281
 282
 283 def chop(main_text):
 284     """ divide main content of the XML file into chunks """
 285
 286     # prepare a container for each chunk
 287     part_xml = etree.Element('utwor')
 288     etree.SubElement(part_xml, 'master')
 289     main_xml_part = part_xml[0] # master
 290
 291     last_node_part = False
 292     for one_part in main_text:
 293         name = one_part.tag
 294         if name == 'naglowek_czesc':
 295             yield part_xml
 296             last_node_part = True
 297             main_xml_part[:] = [deepcopy(one_part)]
 298         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 299             yield part_xml
 300             main_xml_part[:] = [deepcopy(one_part)]
 301         else:
 302             main_xml_part.append(deepcopy(one_part))
 303             last_node_part = False
 304     yield part_xml
 305
 306
 307 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 308     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 309
 310     toc = TOC()
 311     for element in chunk_xml[0]:
 312         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 313             toc.add(node_name(element), "part%d.html" % chunk_no)
 314         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 315             subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
 316             element.set('sub', str(subnumber))
 317     if empty:
 318         if not _empty_html_static:
 319             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 320         chars = set()
 321         output_html = _empty_html_static[0]
 322     else:
 323         find_annotations(annotations, chunk_xml, chunk_no)
 324         replace_by_verse(chunk_xml)
 325         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 326         chars = used_chars(html_tree.getroot())
 327         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 328     return output_html, toc, chars
 329
 330
 331 def transform(wldoc, verbose=False,
 332               style=None, html_toc=False,
 333               sample=None, cover=None, flags=None):
 334     """ produces a EPUB file
 335
 336     sample=n: generate sample e-book (with at least n paragraphs)
 337     cover: a cover.Cover factory or True for default
 338     flags: less-advertising, without-fonts, working-copy
 339     """
 340
 341     def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
 342         """ processes one input file and proceeds to its children """
 343
 344         replace_characters(wldoc.edoc.getroot())
 345
 346         # every input file will have a TOC entry,
 347         # pointing to starting chunk
 348         toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
 349         chars = set()
 350         if first:
 351             # write book title page
 352             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
 353             chars = used_chars(html_tree.getroot())
 354             zip.writestr('OPS/title.html',
 355                  etree.tostring(html_tree, method="html", pretty_print=True))
 356             # add a title page TOC entry
 357             toc.add(u"Strona tytułowa", "title.html")
 358         elif wldoc.book_info.parts:
 359             # write title page for every parent
 360             if sample is not None and sample <= 0:
 361                 chars = set()
 362                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 363             else:
 364                 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
 365                 chars = used_chars(html_tree.getroot())
 366                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 367             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 368             add_to_manifest(manifest, chunk_counter)
 369             add_to_spine(spine, chunk_counter)
 370             chunk_counter += 1
 371
 372         if len(wldoc.edoc.getroot()) > 1:
 373             # rdf before style master
 374             main_text = wldoc.edoc.getroot()[1]
 375         else:
 376             # rdf in style master
 377             main_text = wldoc.edoc.getroot()[0]
 378             if main_text.tag == RDFNS('RDF'):
 379                 main_text = None
 380
 381         if main_text is not None:
 382             for chunk_xml in chop(main_text):
 383                 empty = False
 384                 if sample is not None:
 385                     if sample <= 0:
 386                         empty = True
 387                     else:
 388                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 389                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 390
 391                 toc.extend(chunk_toc)
 392                 chars = chars.union(chunk_chars)
 393                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 394                 add_to_manifest(manifest, chunk_counter)
 395                 add_to_spine(spine, chunk_counter)
 396                 chunk_counter += 1
 397
 398         # for child in wldoc.parts():
 399         #     child_toc, chunk_counter, chunk_chars, sample = transform_file(
 400         #         child, chunk_counter, first=False, sample=sample)
 401         #     toc.append(child_toc)
 402         #     chars = chars.union(chunk_chars)
 403
 404         return toc, chunk_counter, chars, sample
 405
 406
 407     document = deepcopy(wldoc)
 408     del wldoc
 409
 410     if flags:
 411         for flag in flags:
 412             document.edoc.getroot().set(flag, 'yes')
 413
 414     # add editors info
 415     # document.edoc.getroot().set('editors', u', '.join(sorted(
 416     #     editor.readable() for editor in document.editors())))
 417
 418     opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
 419     manifest = opf.find('.//' + OPFNS('manifest'))
 420     guide = opf.find('.//' + OPFNS('guide'))
 421     spine = opf.find('.//' + OPFNS('spine'))
 422
 423     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
 424     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 425
 426     # write static elements
 427     mime = zipfile.ZipInfo()
 428     mime.filename = 'mimetype'
 429     mime.compress_type = zipfile.ZIP_STORED
 430     mime.extra = ''
 431     zip.writestr(mime, 'application/epub+zip')
 432     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 433                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 434                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 435                        'media-type="application/oebps-package+xml" />' \
 436                        '</rootfiles></container>')
 437     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 438     zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
 439     if not style:
 440         style = get_resource('epub/style.css')
 441     zip.write(style, os.path.join('OPS', 'style.css'))
 442
 443     if cover:
 444         cover_file = StringIO()
 445         bound_cover = cover(document.book_info)
 446         bound_cover.save(cover_file)
 447         cover_name = 'cover.%s' % bound_cover.ext()
 448         zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
 449         del cover_file
 450
 451         cover_tree = etree.parse(get_resource('epub/cover.html'))
 452         cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
 453         zip.writestr('OPS/cover.html', etree.tostring(
 454                         cover_tree, method="html", pretty_print=True))
 455
 456         if bound_cover.uses_dc_cover:
 457             if document.book_info.cover_by:
 458                 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
 459             if document.book_info.cover_source:
 460                 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
 461
 462         manifest.append(etree.fromstring(
 463             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 464         manifest.append(etree.fromstring(
 465             '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
 466         spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
 467         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 468         guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
 469
 470
 471     annotations = etree.Element('annotations')
 472
 473     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 474                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 475                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 476                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 477                                '</navMap></ncx>')
 478     nav_map = toc_file[-1]
 479
 480     if html_toc:
 481         manifest.append(etree.fromstring(
 482             '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
 483         spine.append(etree.fromstring(
 484             '<itemref idref="html_toc" />'))
 485         guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
 486
 487     toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
 488
 489     if len(toc.children) < 2:
 490         toc.add(u"Początek utworu", "part1.html")
 491
 492     # Last modifications in container files and EPUB creation
 493     if len(annotations) > 0:
 494         toc.add("Przypisy", "annotations.html")
 495         manifest.append(etree.fromstring(
 496             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 497         spine.append(etree.fromstring(
 498             '<itemref idref="annotations" />'))
 499         replace_by_verse(annotations)
 500         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 501         chars = chars.union(used_chars(html_tree.getroot()))
 502         zip.writestr('OPS/annotations.html', etree.tostring(
 503                             html_tree, method="html", pretty_print=True))
 504
 505     toc.add("Strona redakcyjna", "last.html")
 506     manifest.append(etree.fromstring(
 507         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 508     spine.append(etree.fromstring(
 509         '<itemref idref="last" />'))
 510     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
 511     chars.update(used_chars(html_tree.getroot()))
 512     zip.writestr('OPS/last.html', etree.tostring(
 513                         html_tree, method="html", pretty_print=True))
 514
 515     if not flags or not 'without-fonts' in flags:
 516         # strip fonts
 517         tmpdir = mkdtemp('-librarian-epub')
 518         try:
 519             cwd = os.getcwd()
 520         except OSError:
 521             cwd = None
 522
 523         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 524         for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 525             optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 526                               get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 527             if verbose:
 528                 print "Running font-optimizer"
 529                 subprocess.check_call(optimizer_call)
 530             else:
 531                 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 532             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 533             manifest.append(etree.fromstring(
 534                 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
 535         rmtree(tmpdir)
 536         if cwd is not None:
 537             os.chdir(cwd)
 538
 539     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 540     title = document.book_info.title
 541     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 542     for st in attributes:
 543         meta = toc_file.makeelement(NCXNS('meta'))
 544         meta.set('name', st)
 545         meta.set('content', '0')
 546         toc_file[0].append(meta)
 547     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 548     toc_file[0][1].set('content', str(toc.depth()))
 549     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 550
 551     # write TOC
 552     if html_toc:
 553         toc.add(u"Spis treści", "toc.html", index=1)
 554         zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
 555     toc.write_to_xml(nav_map)
 556     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 557     zip.close()
 558
 559     return IOFile.from_filename(output_file.name)