librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import re
  11 import subprocess
  12 from StringIO import StringIO
  13 from copy import deepcopy
  14 from mimetypes import guess_type
  15
  16 from lxml import etree
  17 import zipfile
  18 from tempfile import mkdtemp, NamedTemporaryFile
  19 from shutil import rmtree
  20
  21 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
  22 from librarian.cover import WLCover
  23
  24 from librarian import functions, get_resource
  25
  26 functions.reg_person_name()
  27
  28
  29 def inner_xml(node):
  30     """ returns node's text and children as a string
  31
  32     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  33     x<b>y</b>z
  34     """
  35
  36     nt = node.text if node.text is not None else ''
  37     return ''.join([nt] + [etree.tostring(child) for child in node])
  38
  39 def set_inner_xml(node, text):
  40     """ sets node's text and children from a string
  41
  42     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  43     >>> set_inner_xml(e, 'x<b>y</b>z')
  44     >>> print etree.tostring(e)
  45     <a>x<b>y</b>z</a>
  46     """
  47
  48     p = etree.fromstring('<x>%s</x>' % text)
  49     node.text = p.text
  50     node[:] = p[:]
  51
  52
  53 def node_name(node):
  54     """ Find out a node's name
  55
  56     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  57     XYZ
  58     """
  59
  60     tempnode = deepcopy(node)
  61
  62     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  63         for e in tempnode.findall('.//%s' % p):
  64             t = e.tail
  65             e.clear()
  66             e.tail = t
  67     etree.strip_tags(tempnode, '*')
  68     return tempnode.text
  69
  70
  71 def xslt(xml, sheet):
  72     if isinstance(xml, etree._Element):
  73         xml = etree.ElementTree(xml)
  74     with open(sheet) as xsltf:
  75         return xml.xslt(etree.parse(xsltf))
  76
  77
  78 def replace_characters(node):
  79     def replace_chars(text):
  80         if text is None:
  81             return None
  82         return text.replace(u"\ufeff", u"")\
  83                    .replace("---", u"\u2014")\
  84                    .replace("--", u"\u2013")\
  85                    .replace(",,", u"\u201E")\
  86                    .replace('"', u"\u201D")\
  87                    .replace("'", u"\u2019")
  88     if node.tag in ('uwaga', 'extra'):
  89         t = node.tail
  90         node.clear()
  91         node.tail = t
  92     node.text = replace_chars(node.text)
  93     node.tail = replace_chars(node.tail)
  94     for child in node:
  95         replace_characters(child)
  96
  97
  98 def find_annotations(annotations, source, part_no):
  99     for child in source:
 100         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 101             annotation = deepcopy(child)
 102             number = str(len(annotations)+1)
 103             annotation.set('number', number)
 104             annotation.set('part', str(part_no))
 105             annotation.tail = ''
 106             annotations.append(annotation)
 107             tail = child.tail
 108             child.clear()
 109             child.tail = tail
 110             child.text = number
 111         if child.tag not in ('extra', 'uwaga'):
 112             find_annotations(annotations, child, part_no)
 113
 114
 115 class Stanza(object):
 116     """
 117     Converts / verse endings into verse elements in a stanza.
 118
 119     Slashes may only occur directly in the stanza. Any slashes in subelements
 120     will be ignored, and the subelements will be put inside verse elements.
 121
 122     >>> s = etree.fromstring("<strofa>a/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
 123     >>> Stanza(s).versify()
 124     >>> print etree.tostring(s)
 125     <strofa><wers_normalny>a</wers_normalny><wers_normalny>b<x>x/
 126     y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
 127
 128     """
 129     def __init__(self, stanza_elem):
 130         self.stanza = stanza_elem
 131         self.verses = []
 132         self.open_verse = None
 133
 134     def versify(self):
 135         self.push_text(self.stanza.text)
 136         for elem in self.stanza:
 137             self.push_elem(elem)
 138             self.push_text(elem.tail)
 139         tail = self.stanza.tail
 140         self.stanza.clear()
 141         self.stanza.tail = tail
 142         self.stanza.extend(self.verses)
 143
 144     def open_normal_verse(self):
 145         self.open_verse = self.stanza.makeelement("wers_normalny")
 146         self.verses.append(self.open_verse)
 147
 148     def get_open_verse(self):
 149         if self.open_verse is None:
 150             self.open_normal_verse()
 151         return self.open_verse
 152
 153     def push_text(self, text):
 154         if not text or not text.strip():
 155             return
 156         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
 157             if i:
 158                 self.open_normal_verse()
 159             verse = self.get_open_verse()
 160             if len(verse):
 161                 verse[-1].tail = (verse[-1].tail or "") + verse_text.strip()
 162             else:
 163                 verse.text = (verse.text or "") + verse_text.strip()
 164
 165     def push_elem(self, elem):
 166         if elem.tag.startswith("wers"):
 167             verse = deepcopy(elem)
 168             verse.tail = None
 169             self.verses.append(verse)
 170             self.open_verse = verse
 171         else:
 172             appended = deepcopy(elem)
 173             appended.tail = None
 174             self.get_open_verse().append(appended)
 175
 176
 177 def replace_by_verse(tree):
 178     """ Find stanzas and create new verses in place of a '/' character """
 179
 180     stanzas = tree.findall('.//' + WLNS('strofa'))
 181     for stanza in stanzas:
 182         Stanza(stanza).versify()
 183
 184
 185 def add_to_manifest(manifest, partno):
 186     """ Adds a node to the manifest section in content.opf file """
 187
 188     partstr = 'part%d' % partno
 189     e = manifest.makeelement(OPFNS('item'), attrib={
 190                                  'id': partstr,
 191                                  'href': partstr + '.html',
 192                                  'media-type': 'application/xhtml+xml',
 193                              })
 194     manifest.append(e)
 195
 196
 197 def add_to_spine(spine, partno):
 198     """ Adds a node to the spine section in content.opf file """
 199
 200     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 201     spine.append(e)
 202
 203
 204 class TOC(object):
 205     def __init__(self, name=None, part_href=None):
 206         self.children = []
 207         self.name = name
 208         self.part_href = part_href
 209         self.sub_number = None
 210
 211     def add(self, name, part_href, level=0, is_part=True, index=None):
 212         assert level == 0 or index is None
 213         if level > 0 and self.children:
 214             return self.children[-1].add(name, part_href, level-1, is_part)
 215         else:
 216             t = TOC(name)
 217             t.part_href = part_href
 218             if index is not None:
 219                 self.children.insert(index, t)
 220             else:
 221                 self.children.append(t)
 222             if not is_part:
 223                 t.sub_number = len(self.children) + 1
 224                 return t.sub_number
 225
 226     def append(self, toc):
 227         self.children.append(toc)
 228
 229     def extend(self, toc):
 230         self.children.extend(toc.children)
 231
 232     def depth(self):
 233         if self.children:
 234             return max((c.depth() for c in self.children)) + 1
 235         else:
 236             return 0
 237
 238     def href(self):
 239         src = self.part_href
 240         if self.sub_number is not None:
 241             src += '#sub%d' % self.sub_number
 242         return src
 243
 244     def write_to_xml(self, nav_map, counter=1):
 245         for child in self.children:
 246             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 247             nav_point.set('id', 'NavPoint-%d' % counter)
 248             nav_point.set('playOrder', str(counter))
 249
 250             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 251             text = nav_map.makeelement(NCXNS('text'))
 252             text.text = child.name
 253             nav_label.append(text)
 254             nav_point.append(nav_label)
 255
 256             content = nav_map.makeelement(NCXNS('content'))
 257             content.set('src', child.href())
 258             nav_point.append(content)
 259             nav_map.append(nav_point)
 260             counter = child.write_to_xml(nav_point, counter + 1)
 261         return counter
 262
 263     def html_part(self, depth=0):
 264         texts = []
 265         for child in self.children:
 266             texts.append(
 267                 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
 268                 (depth, child.href(), child.name))
 269             texts.append(child.html_part(depth+1))
 270         return "\n".join(texts)
 271
 272     def html(self):
 273         with open(get_resource('epub/toc.html')) as f:
 274             t = unicode(f.read(), 'utf-8')
 275         return t % self.html_part()
 276
 277
 278 def used_chars(element):
 279     """ Lists characters used in an ETree Element """
 280     chars = set((element.text or '') + (element.tail or ''))
 281     for child in element:
 282         chars = chars.union(used_chars(child))
 283     return chars
 284
 285
 286 def chop(main_text):
 287     """ divide main content of the XML file into chunks """
 288
 289     # prepare a container for each chunk
 290     part_xml = etree.Element('utwor')
 291     etree.SubElement(part_xml, 'master')
 292     main_xml_part = part_xml[0] # master
 293
 294     last_node_part = False
 295     for one_part in main_text:
 296         name = one_part.tag
 297         if name == 'naglowek_czesc':
 298             yield part_xml
 299             last_node_part = True
 300             main_xml_part[:] = [deepcopy(one_part)]
 301         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 302             yield part_xml
 303             main_xml_part[:] = [deepcopy(one_part)]
 304         else:
 305             main_xml_part.append(deepcopy(one_part))
 306             last_node_part = False
 307     yield part_xml
 308
 309
 310 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 311     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 312
 313     toc = TOC()
 314     for element in chunk_xml[0]:
 315         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 316             toc.add(node_name(element), "part%d.html" % chunk_no)
 317         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 318             subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
 319             element.set('sub', str(subnumber))
 320     if empty:
 321         if not _empty_html_static:
 322             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 323         chars = set()
 324         output_html = _empty_html_static[0]
 325     else:
 326         find_annotations(annotations, chunk_xml, chunk_no)
 327         replace_by_verse(chunk_xml)
 328         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 329         chars = used_chars(html_tree.getroot())
 330         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 331     return output_html, toc, chars
 332
 333
 334 def transform(wldoc, verbose=False, style=None, html_toc=False,
 335               sample=None, cover=None, flags=None, ilustr_path=''):
 336     """ produces a EPUB file
 337
 338     sample=n: generate sample e-book (with at least n paragraphs)
 339     cover: a cover.Cover factory or True for default
 340     flags: less-advertising, without-fonts, working-copy
 341     """
 342
 343     def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
 344         """ processes one input file and proceeds to its children """
 345
 346         replace_characters(wldoc.edoc.getroot())
 347
 348         # every input file will have a TOC entry,
 349         # pointing to starting chunk
 350         toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
 351         chars = set()
 352         if first:
 353             # write book title page
 354             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
 355             chars = used_chars(html_tree.getroot())
 356             zip.writestr('OPS/title.html',
 357                  etree.tostring(html_tree, method="html", pretty_print=True))
 358             # add a title page TOC entry
 359             toc.add(u"Strona tytułowa", "title.html")
 360         elif wldoc.book_info.parts:
 361             # write title page for every parent
 362             if sample is not None and sample <= 0:
 363                 chars = set()
 364                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 365             else:
 366                 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
 367                 chars = used_chars(html_tree.getroot())
 368                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 369             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 370             add_to_manifest(manifest, chunk_counter)
 371             add_to_spine(spine, chunk_counter)
 372             chunk_counter += 1
 373
 374         if len(wldoc.edoc.getroot()) > 1:
 375             # rdf before style master
 376             main_text = wldoc.edoc.getroot()[1]
 377         else:
 378             # rdf in style master
 379             main_text = wldoc.edoc.getroot()[0]
 380             if main_text.tag == RDFNS('RDF'):
 381                 main_text = None
 382
 383         if main_text is not None:
 384             for chunk_xml in chop(main_text):
 385                 empty = False
 386                 if sample is not None:
 387                     if sample <= 0:
 388                         empty = True
 389                     else:
 390                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 391                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 392
 393                 toc.extend(chunk_toc)
 394                 chars = chars.union(chunk_chars)
 395                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 396                 add_to_manifest(manifest, chunk_counter)
 397                 add_to_spine(spine, chunk_counter)
 398                 chunk_counter += 1
 399
 400         for child in wldoc.parts():
 401             child_toc, chunk_counter, chunk_chars, sample = transform_file(
 402                 child, chunk_counter, first=False, sample=sample)
 403             toc.append(child_toc)
 404             chars = chars.union(chunk_chars)
 405
 406         return toc, chunk_counter, chars, sample
 407
 408
 409     document = deepcopy(wldoc)
 410     del wldoc
 411
 412     if flags:
 413         for flag in flags:
 414             document.edoc.getroot().set(flag, 'yes')
 415
 416     # add editors info
 417     document.edoc.getroot().set('editors', u', '.join(sorted(
 418         editor.readable() for editor in document.editors())))
 419
 420     opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
 421     manifest = opf.find('.//' + OPFNS('manifest'))
 422     guide = opf.find('.//' + OPFNS('guide'))
 423     spine = opf.find('.//' + OPFNS('spine'))
 424
 425     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
 426     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 427
 428     if os.path.isdir(ilustr_path):
 429         for i, filename in enumerate(os.listdir(ilustr_path)):
 430             file_path = os.path.join(ilustr_path, filename)
 431             zip.write(file_path, os.path.join('OPS', filename))
 432             image_id = 'image%s' % i
 433             manifest.append(etree.fromstring(
 434                 '<item id="%s" href="%s" media-type="%s" />' % (image_id, filename, guess_type(file_path)[0])))
 435
 436     # write static elements
 437     mime = zipfile.ZipInfo()
 438     mime.filename = 'mimetype'
 439     mime.compress_type = zipfile.ZIP_STORED
 440     mime.extra = ''
 441     zip.writestr(mime, 'application/epub+zip')
 442     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 443                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 444                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 445                        'media-type="application/oebps-package+xml" />' \
 446                        '</rootfiles></container>')
 447     #zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 448     zip.write(get_resource('res/koedlogo.png'), os.path.join('OPS', 'logo_koed.png'))
 449     #zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
 450     if not style:
 451         style = get_resource('epub/style.css')
 452     zip.write(style, os.path.join('OPS', 'style.css'))
 453
 454     if cover is None:
 455         cover = WLCover
 456     if cover:
 457         if cover is True:
 458             cover = WLCover
 459
 460         cover_file = StringIO()
 461         bound_cover = cover(document.book_info)
 462         bound_cover.save(cover_file)
 463         cover_name = 'cover.%s' % bound_cover.ext()
 464         zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
 465         del cover_file
 466
 467         cover_tree = etree.parse(get_resource('epub/cover.html'))
 468         cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
 469         zip.writestr('OPS/cover.html', etree.tostring(
 470                         cover_tree, method="html", pretty_print=True))
 471
 472         if bound_cover.uses_dc_cover:
 473             if document.book_info.cover_by:
 474                 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
 475             if document.book_info.cover_source:
 476                 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
 477
 478         manifest.append(etree.fromstring(
 479             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 480         manifest.append(etree.fromstring(
 481             '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
 482         spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
 483         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 484         guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
 485
 486
 487     annotations = etree.Element('annotations')
 488
 489     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 490                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 491                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 492                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 493                                '</navMap></ncx>')
 494     nav_map = toc_file[-1]
 495
 496     if html_toc:
 497         manifest.append(etree.fromstring(
 498             '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
 499         spine.append(etree.fromstring(
 500             '<itemref idref="html_toc" />'))
 501         guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
 502
 503     toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
 504
 505     if len(toc.children) < 2:
 506         toc.add(u"Początek utworu", "part1.html")
 507
 508     # Last modifications in container files and EPUB creation
 509     if len(annotations) > 0:
 510         toc.add("Przypisy", "annotations.html")
 511         manifest.append(etree.fromstring(
 512             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 513         spine.append(etree.fromstring(
 514             '<itemref idref="annotations" />'))
 515         replace_by_verse(annotations)
 516         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 517         chars = chars.union(used_chars(html_tree.getroot()))
 518         zip.writestr('OPS/annotations.html', etree.tostring(
 519                             html_tree, method="html", pretty_print=True))
 520
 521     toc.add("Strona redakcyjna", "last.html")
 522     manifest.append(etree.fromstring(
 523         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 524     spine.append(etree.fromstring(
 525         '<itemref idref="last" />'))
 526     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
 527     chars.update(used_chars(html_tree.getroot()))
 528     zip.writestr('OPS/last.html', etree.tostring(
 529                         html_tree, method="html", pretty_print=True))
 530
 531     if not flags or not 'without-fonts' in flags:
 532         # strip fonts
 533         tmpdir = mkdtemp('-librarian-epub')
 534         try:
 535             cwd = os.getcwd()
 536         except OSError:
 537             cwd = None
 538
 539         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 540         for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 541             optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 542                               get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 543             if verbose:
 544                 print "Running font-optimizer"
 545                 subprocess.check_call(optimizer_call)
 546             else:
 547                 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 548             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 549             manifest.append(etree.fromstring(
 550                 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
 551         rmtree(tmpdir)
 552         if cwd is not None:
 553             os.chdir(cwd)
 554
 555     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 556     title = document.book_info.title
 557     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 558     for st in attributes:
 559         meta = toc_file.makeelement(NCXNS('meta'))
 560         meta.set('name', st)
 561         meta.set('content', '0')
 562         toc_file[0].append(meta)
 563     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 564     toc_file[0][1].set('content', str(toc.depth()))
 565     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 566
 567     # write TOC
 568     if html_toc:
 569         toc.add(u"Spis treści", "toc.html", index=1)
 570         zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
 571     toc.write_to_xml(nav_map)
 572     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 573     zip.close()
 574
 575     return OutputFile.from_filename(output_file.name)