librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from StringIO import StringIO
  12 from copy import deepcopy
  13 from lxml import etree
  14 import zipfile
  15 from tempfile import mkdtemp
  16 from shutil import rmtree
  17
  18 import sys
  19
  20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
  21 from librarian.dcparser import BookInfo
  22
  23 from librarian import functions, get_resource
  24
# Runs once, when this module is imported; presumably registers the
# person-name helper needed by the XSLT stylesheets used below
# -- TODO confirm in librarian.functions.
functions.reg_person_name()
  26
  27
  28 def inner_xml(node):
  29     """ returns node's text and children as a string
  30
  31     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  32     x<b>y</b>z
  33     """
  34
  35     nt = node.text if node.text is not None else ''
  36     return ''.join([nt] + [etree.tostring(child) for child in node])
  37
  38 def set_inner_xml(node, text):
  39     """ sets node's text and children from a string
  40
  41     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  42     >>> set_inner_xml(e, 'x<b>y</b>z')
  43     >>> print etree.tostring(e)
  44     <a>x<b>y</b>z</a>
  45     """
  46
  47     p = etree.fromstring('<x>%s</x>' % text)
  48     node.text = p.text
  49     node[:] = p[:]
  50
  51
  52 def node_name(node):
  53     """ Find out a node's name
  54
  55     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  56     XYZ
  57     """
  58
  59     tempnode = deepcopy(node)
  60
  61     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  62         for e in tempnode.findall('.//%s' % p):
  63             t = e.tail
  64             e.clear()
  65             e.tail = t
  66     etree.strip_tags(tempnode, '*')
  67     return tempnode.text
  68
  69
  70 def xslt(xml, sheet):
  71     if isinstance(xml, etree._Element):
  72         xml = etree.ElementTree(xml)
  73     with open(sheet) as xsltf:
  74         return xml.xslt(etree.parse(xsltf))
  75
  76
  77 def replace_characters(node):
  78     def replace_chars(text):
  79         if text is None:
  80             return None
  81         return text.replace(u"\ufeff", u"")\
  82                    .replace("---", u"\u2014")\
  83                    .replace("--", u"\u2013")\
  84                    .replace(",,", u"\u201E")\
  85                    .replace('"', u"\u201D")\
  86                    .replace("'", u"\u2019")
  87     if node.tag in ('uwaga', 'extra'):
  88         t = node.tail
  89         node.clear()
  90         node.tail = t
  91     node.text = replace_chars(node.text)
  92     node.tail = replace_chars(node.tail)
  93     for child in node:
  94         replace_characters(child)
  95
  96
  97 def find_annotations(annotations, source, part_no):
  98     for child in source:
  99         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 100             annotation = deepcopy(child)
 101             number = str(len(annotations)+1)
 102             annotation.set('number', number)
 103             annotation.set('part', str(part_no))
 104             annotation.tail = ''
 105             annotations.append(annotation)
 106             tail = child.tail
 107             child.clear()
 108             child.tail = tail
 109             child.text = number
 110         if child.tag not in ('extra', 'uwaga'):
 111             find_annotations(annotations, child, part_no)
 112
 113
 114 def replace_by_verse(tree):
 115     """ Find stanzas and create new verses in place of a '/' character """
 116
 117     stanzas = tree.findall('.//' + WLNS('strofa'))
 118     for node in stanzas:
 119         for child_node in node:
 120             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 121                 foreign_verses = inner_xml(child_node).split('/\n')
 122                 if len(foreign_verses) > 1:
 123                     new_foreign = ''
 124                     for foreign_verse in foreign_verses:
 125                         if foreign_verse.startswith('<wers'):
 126                             new_foreign += foreign_verse
 127                         else:
 128                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 129                     set_inner_xml(child_node, new_foreign)
 130         verses = inner_xml(node).split('/\n')
 131         if len(verses) > 1:
 132             modified_inner_xml = ''
 133             for verse in verses:
 134                 if verse.startswith('<wers') or verse.startswith('<extra'):
 135                     modified_inner_xml += verse
 136                 else:
 137                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 138             set_inner_xml(node, modified_inner_xml)
 139
 140
 141 def add_to_manifest(manifest, partno):
 142     """ Adds a node to the manifest section in content.opf file """
 143
 144     partstr = 'part%d' % partno
 145     e = manifest.makeelement(OPFNS('item'), attrib={
 146                                  'id': partstr,
 147                                  'href': partstr + '.html',
 148                                  'media-type': 'application/xhtml+xml',
 149                              })
 150     manifest.append(e)
 151
 152
 153 def add_to_spine(spine, partno):
 154     """ Adds a node to the spine section in content.opf file """
 155
 156     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 157     spine.append(e)
 158
 159
 160 class TOC(object):
 161     def __init__(self, name=None, part_number=None):
 162         self.children = []
 163         self.name = name
 164         self.part_number = part_number
 165         self.sub_number = None
 166
 167     def add(self, name, part_number, level=0, is_part=True):
 168         if level > 0 and self.children:
 169             return self.children[-1].add(name, part_number, level-1, is_part)
 170         else:
 171             t = TOC(name)
 172             t.part_number = part_number
 173             self.children.append(t)
 174             if not is_part:
 175                 t.sub_number = len(self.children) + 1
 176                 return t.sub_number
 177
 178     def append(self, toc):
 179         self.children.append(toc)
 180
 181     def extend(self, toc):
 182         self.children.extend(toc.children)
 183
 184     def depth(self):
 185         if self.children:
 186             return max((c.depth() for c in self.children)) + 1
 187         else:
 188             return 0
 189
 190     def write_to_xml(self, nav_map, counter):
 191         for child in self.children:
 192             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 193             nav_point.set('id', 'NavPoint-%d' % counter)
 194             nav_point.set('playOrder', str(counter))
 195
 196             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 197             text = nav_map.makeelement(NCXNS('text'))
 198             text.text = child.name
 199             nav_label.append(text)
 200             nav_point.append(nav_label)
 201
 202             content = nav_map.makeelement(NCXNS('content'))
 203             src = 'part%d.html' % child.part_number
 204             if child.sub_number is not None:
 205                 src += '#sub%d' % child.sub_number
 206             content.set('src', src)
 207             nav_point.append(content)
 208             nav_map.append(nav_point)
 209             counter = child.write_to_xml(nav_point, counter + 1)
 210         return counter
 211
 212
 213 def used_chars(element):
 214     """ Lists characters used in an ETree Element """
 215     chars = set((element.text or '') + (element.tail or ''))
 216     for child in element:
 217         chars = chars.union(used_chars(child))
 218     return chars
 219
 220
 221 def chop(main_text):
 222     """ divide main content of the XML file into chunks """
 223
 224     # prepare a container for each chunk
 225     part_xml = etree.Element('utwor')
 226     etree.SubElement(part_xml, 'master')
 227     main_xml_part = part_xml[0] # master
 228
 229     last_node_part = False
 230     for one_part in main_text:
 231         name = one_part.tag
 232         if name == 'naglowek_czesc':
 233             yield part_xml
 234             last_node_part = True
 235             main_xml_part[:] = [deepcopy(one_part)]
 236         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 237             yield part_xml
 238             main_xml_part[:] = [deepcopy(one_part)]
 239         else:
 240             main_xml_part.append(deepcopy(one_part))
 241             last_node_part = False
 242     yield part_xml
 243
 244
 245 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 246     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 247
 248     toc = TOC()
 249     for element in chunk_xml[0]:
 250         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 251             toc.add(node_name(element), chunk_no)
 252         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 253             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 254             element.set('sub', str(subnumber))
 255     if empty:
 256         if not _empty_html_static:
 257             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 258         chars = set()
 259         output_html = _empty_html_static[0]
 260     else:
 261         find_annotations(annotations, chunk_xml, chunk_no)
 262         replace_by_verse(chunk_xml)
 263         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 264         chars = used_chars(html_tree.getroot())
 265         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 266     return output_html, toc, chars
 267
 268
 269 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
 270               style=None,
 271               sample=None, cover=None, flags=None):
 272     """ produces a EPUB file
 273
 274     provider: a DocProvider
 275     slug: slug of file to process, available by provider
 276     output_file: file-like object or path to output file
 277     output_dir: path to directory to save output file to; either this or output_file must be present
 278     make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
 279     sample=n: generate sample e-book (with at least n paragraphs)
 280     cover: a cover.Cover object
 281     flags: less-advertising, without-fonts
 282     """
 283
 284     def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
 285         """ processes one input file and proceeds to its children """
 286
 287         replace_characters(input_xml.getroot())
 288
 289         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 290
 291         # every input file will have a TOC entry,
 292         # pointing to starting chunk
 293         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 294         chars = set()
 295         if first:
 296             # write book title page
 297             html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
 298             chars = used_chars(html_tree.getroot())
 299             zip.writestr('OPS/title.html',
 300                  etree.tostring(html_tree, method="html", pretty_print=True))
 301         elif children:
 302             # write title page for every parent
 303             if sample is not None and sample <= 0:
 304                 chars = set()
 305                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 306             else:
 307                 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
 308                 chars = used_chars(html_tree.getroot())
 309                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 310             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 311             add_to_manifest(manifest, chunk_counter)
 312             add_to_spine(spine, chunk_counter)
 313             chunk_counter += 1
 314
 315         if len(input_xml.getroot()) > 1:
 316             # rdf before style master
 317             main_text = input_xml.getroot()[1]
 318         else:
 319             # rdf in style master
 320             main_text = input_xml.getroot()[0]
 321             if main_text.tag == RDFNS('RDF'):
 322                 main_text = None
 323
 324         if main_text is not None:
 325             for chunk_xml in chop(main_text):
 326                 empty = False
 327                 if sample is not None:
 328                     if sample <= 0:
 329                         empty = True
 330                     else:
 331                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 332                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 333
 334                 toc.extend(chunk_toc)
 335                 chars = chars.union(chunk_chars)
 336                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 337                 add_to_manifest(manifest, chunk_counter)
 338                 add_to_spine(spine, chunk_counter)
 339                 chunk_counter += 1
 340
 341         if children:
 342             for child in children:
 343                 child_xml = etree.parse(provider.by_uri(child))
 344                 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
 345                 toc.append(child_toc)
 346                 chars = chars.union(chunk_chars)
 347
 348         return toc, chunk_counter, chars, sample
 349
 350     # read metadata from the first file
 351     if file_path:
 352         if slug:
 353             raise ValueError('slug or file_path should be specified, not both')
 354         f = open(file_path, 'r')
 355         input_xml = etree.parse(f)
 356         f.close()
 357     else:
 358         if not slug:
 359             raise ValueError('either slug or file_path should be specified')
 360         input_xml = etree.parse(provider[slug])
 361
 362     if flags:
 363         for flag in flags:
 364             input_xml.getroot().set(flag, 'yes')
 365
 366     metadata = input_xml.find('.//'+RDFNS('Description'))
 367     if metadata is None:
 368         raise NoDublinCore('Document has no DublinCore - which is required.')
 369     book_info = BookInfo.from_element(input_xml)
 370     metadata = etree.ElementTree(metadata)
 371
 372     # if output to dir, create the file
 373     if output_dir is not None:
 374         if make_dir:
 375             author = unicode(book_info.author)
 376             output_dir = os.path.join(output_dir, author)
 377             try:
 378                 os.makedirs(output_dir)
 379             except OSError:
 380                 pass
 381         if slug:
 382             output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
 383         else:
 384             output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
 385
 386     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 387
 388     # write static elements
 389     mime = zipfile.ZipInfo()
 390     mime.filename = 'mimetype'
 391     mime.compress_type = zipfile.ZIP_STORED
 392     mime.extra = ''
 393     zip.writestr(mime, 'application/epub+zip')
 394     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 395                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 396                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 397                        'media-type="application/oebps-package+xml" />' \
 398                        '</rootfiles></container>')
 399     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 400     zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
 401     if not style:
 402         style = get_resource('epub/style.css')
 403     zip.write(style, os.path.join('OPS', 'style.css'))
 404
 405     opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
 406     manifest = opf.find('.//' + OPFNS('manifest'))
 407     spine = opf.find('.//' + OPFNS('spine'))
 408
 409     if cover:
 410         cover_file = StringIO()
 411         c = cover(book_info.author.readable(), book_info.title)
 412         c.save(cover_file)
 413         c_name = 'cover.%s' % c.ext()
 414         zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
 415         del cover_file
 416
 417         cover_tree = etree.parse(get_resource('epub/cover.html'))
 418         cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
 419         zip.writestr('OPS/cover.html', etree.tostring(
 420                         cover_tree, method="html", pretty_print=True))
 421
 422         manifest.append(etree.fromstring(
 423             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 424         manifest.append(etree.fromstring(
 425             '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
 426         spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
 427         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 428         opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))
 429
 430
 431     annotations = etree.Element('annotations')
 432
 433     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 434                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 435                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 436                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 437                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 438                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 439                                '</navPoint></navMap></ncx>')
 440     nav_map = toc_file[-1]
 441
 442     toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
 443
 444     if not toc.children:
 445         toc.add(u"Początek utworu", 1)
 446     toc_counter = toc.write_to_xml(nav_map, 2)
 447
 448     # Last modifications in container files and EPUB creation
 449     if len(annotations) > 0:
 450         nav_map.append(etree.fromstring(
 451             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 452             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 453         toc_counter += 1
 454         manifest.append(etree.fromstring(
 455             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 456         spine.append(etree.fromstring(
 457             '<itemref idref="annotations" />'))
 458         replace_by_verse(annotations)
 459         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 460         chars = chars.union(used_chars(html_tree.getroot()))
 461         zip.writestr('OPS/annotations.html', etree.tostring(
 462                             html_tree, method="html", pretty_print=True))
 463
 464     nav_map.append(etree.fromstring(
 465         '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
 466         '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
 467     manifest.append(etree.fromstring(
 468         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 469     spine.append(etree.fromstring(
 470         '<itemref idref="last" />'))
 471     html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
 472     chars.update(used_chars(html_tree.getroot()))
 473     zip.writestr('OPS/last.html', etree.tostring(
 474                         html_tree, method="html", pretty_print=True))
 475
 476     if not flags or not 'without-fonts' in flags:
 477         # strip fonts
 478         tmpdir = mkdtemp('-librarian-epub')
 479         cwd = os.getcwd()
 480
 481         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 482         for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 483             optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 484                               get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 485             if verbose:
 486                 print "Running font-optimizer"
 487                 subprocess.check_call(optimizer_call)
 488             else:
 489                 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 490             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 491             manifest.append(etree.fromstring(
 492                 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
 493         rmtree(tmpdir)
 494         os.chdir(cwd)
 495
 496     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 497     contents = []
 498     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 499     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 500     for st in attributes:
 501         meta = toc_file.makeelement(NCXNS('meta'))
 502         meta.set('name', st)
 503         meta.set('content', '0')
 504         toc_file[0].append(meta)
 505     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 506     toc_file[0][1].set('content', str(toc.depth()))
 507     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 508     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 509     zip.close()