librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from StringIO import StringIO
  12 from copy import deepcopy
  13 from lxml import etree
  14 import zipfile
  15 from tempfile import mkdtemp
  16 from shutil import rmtree
  17
  18 import sys
  19
  20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
  21 from librarian.dcparser import BookInfo
  22
  23 from librarian import functions, get_resource
  24
# Import-time side effect: calls reg_person_name() from librarian.functions.
# NOTE(review): presumably this registers a helper function used by the XSLT
# stylesheets applied below -- confirm against librarian/functions.py.
functions.reg_person_name()
  26
  27
  28 def inner_xml(node):
  29     """ returns node's text and children as a string
  30
  31     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  32     x<b>y</b>z
  33     """
  34
  35     nt = node.text if node.text is not None else ''
  36     return ''.join([nt] + [etree.tostring(child) for child in node])
  37
  38 def set_inner_xml(node, text):
  39     """ sets node's text and children from a string
  40
  41     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  42     >>> set_inner_xml(e, 'x<b>y</b>z')
  43     >>> print etree.tostring(e)
  44     <a>x<b>y</b>z</a>
  45     """
  46
  47     p = etree.fromstring('<x>%s</x>' % text)
  48     node.text = p.text
  49     node[:] = p[:]
  50
  51
  52 def node_name(node):
  53     """ Find out a node's name
  54
  55     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  56     XYZ
  57     """
  58
  59     tempnode = deepcopy(node)
  60
  61     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  62         for e in tempnode.findall('.//%s' % p):
  63             t = e.tail
  64             e.clear()
  65             e.tail = t
  66     etree.strip_tags(tempnode, '*')
  67     return tempnode.text
  68
  69
  70 def xslt(xml, sheet):
  71     if isinstance(xml, etree._Element):
  72         xml = etree.ElementTree(xml)
  73     with open(sheet) as xsltf:
  74         return xml.xslt(etree.parse(xsltf))
  75
  76
  77 def replace_characters(node):
  78     def replace_chars(text):
  79         if text is None:
  80             return None
  81         return text.replace(u"\ufeff", u"")\
  82                    .replace("---", u"\u2014")\
  83                    .replace("--", u"\u2013")\
  84                    .replace(",,", u"\u201E")\
  85                    .replace('"', u"\u201D")\
  86                    .replace("'", u"\u2019")
  87     if node.tag in ('uwaga', 'extra'):
  88         t = node.tail
  89         node.clear()
  90         node.tail = t
  91     node.text = replace_chars(node.text)
  92     node.tail = replace_chars(node.tail)
  93     for child in node:
  94         replace_characters(child)
  95
  96
  97 def find_annotations(annotations, source, part_no):
  98     for child in source:
  99         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 100             annotation = deepcopy(child)
 101             number = str(len(annotations)+1)
 102             annotation.set('number', number)
 103             annotation.set('part', str(part_no))
 104             annotation.tail = ''
 105             annotations.append(annotation)
 106             tail = child.tail
 107             child.clear()
 108             child.tail = tail
 109             child.text = number
 110         if child.tag not in ('extra', 'uwaga'):
 111             find_annotations(annotations, child, part_no)
 112
 113
 114 def replace_by_verse(tree):
 115     """ Find stanzas and create new verses in place of a '/' character """
 116
 117     stanzas = tree.findall('.//' + WLNS('strofa'))
 118     for node in stanzas:
 119         for child_node in node:
 120             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 121                 foreign_verses = inner_xml(child_node).split('/\n')
 122                 if len(foreign_verses) > 1:
 123                     new_foreign = ''
 124                     for foreign_verse in foreign_verses:
 125                         if foreign_verse.startswith('<wers'):
 126                             new_foreign += foreign_verse
 127                         else:
 128                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 129                     set_inner_xml(child_node, new_foreign)
 130         verses = inner_xml(node).split('/\n')
 131         if len(verses) > 1:
 132             modified_inner_xml = ''
 133             for verse in verses:
 134                 if verse.startswith('<wers') or verse.startswith('<extra'):
 135                     modified_inner_xml += verse
 136                 else:
 137                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 138             set_inner_xml(node, modified_inner_xml)
 139
 140
 141 def add_to_manifest(manifest, partno):
 142     """ Adds a node to the manifest section in content.opf file """
 143
 144     partstr = 'part%d' % partno
 145     e = manifest.makeelement(OPFNS('item'), attrib={
 146                                  'id': partstr,
 147                                  'href': partstr + '.html',
 148                                  'media-type': 'application/xhtml+xml',
 149                              })
 150     manifest.append(e)
 151
 152
 153 def add_to_spine(spine, partno):
 154     """ Adds a node to the spine section in content.opf file """
 155
 156     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 157     spine.append(e)
 158
 159
 160 class TOC(object):
 161     def __init__(self, name=None, part_href=None):
 162         self.children = []
 163         self.name = name
 164         self.part_href = part_href
 165         self.sub_number = None
 166
 167     def add(self, name, part_href, level=0, is_part=True, index=None):
 168         assert level == 0 or index is None
 169         if level > 0 and self.children:
 170             return self.children[-1].add(name, part_href, level-1, is_part)
 171         else:
 172             t = TOC(name)
 173             t.part_href = part_href
 174             if index is not None:
 175                 self.children.insert(index, t)
 176             else:
 177                 self.children.append(t)
 178             if not is_part:
 179                 t.sub_number = len(self.children) + 1
 180                 return t.sub_number
 181
 182     def append(self, toc):
 183         self.children.append(toc)
 184
 185     def extend(self, toc):
 186         self.children.extend(toc.children)
 187
 188     def depth(self):
 189         if self.children:
 190             return max((c.depth() for c in self.children)) + 1
 191         else:
 192             return 0
 193
 194     def href(self):
 195         src = self.part_href
 196         if self.sub_number is not None:
 197             src += '#sub%d' % self.sub_number
 198         return src
 199
 200     def write_to_xml(self, nav_map, counter=1):
 201         for child in self.children:
 202             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 203             nav_point.set('id', 'NavPoint-%d' % counter)
 204             nav_point.set('playOrder', str(counter))
 205
 206             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 207             text = nav_map.makeelement(NCXNS('text'))
 208             text.text = child.name
 209             nav_label.append(text)
 210             nav_point.append(nav_label)
 211
 212             content = nav_map.makeelement(NCXNS('content'))
 213             content.set('src', child.href())
 214             nav_point.append(content)
 215             nav_map.append(nav_point)
 216             counter = child.write_to_xml(nav_point, counter + 1)
 217         return counter
 218
 219     def html_part(self, depth=0):
 220         texts = []
 221         for child in self.children:
 222             texts.append(
 223                 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
 224                 (depth, child.href(), child.name))
 225             texts.append(child.html_part(depth+1))
 226         return "\n".join(texts)
 227
 228     def html(self):
 229         with open(get_resource('epub/toc.html')) as f:
 230             t = unicode(f.read(), 'utf-8')
 231         return t % self.html_part()
 232
 233
 234 def used_chars(element):
 235     """ Lists characters used in an ETree Element """
 236     chars = set((element.text or '') + (element.tail or ''))
 237     for child in element:
 238         chars = chars.union(used_chars(child))
 239     return chars
 240
 241
 242 def chop(main_text):
 243     """ divide main content of the XML file into chunks """
 244
 245     # prepare a container for each chunk
 246     part_xml = etree.Element('utwor')
 247     etree.SubElement(part_xml, 'master')
 248     main_xml_part = part_xml[0] # master
 249
 250     last_node_part = False
 251     for one_part in main_text:
 252         name = one_part.tag
 253         if name == 'naglowek_czesc':
 254             yield part_xml
 255             last_node_part = True
 256             main_xml_part[:] = [deepcopy(one_part)]
 257         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 258             yield part_xml
 259             main_xml_part[:] = [deepcopy(one_part)]
 260         else:
 261             main_xml_part.append(deepcopy(one_part))
 262             last_node_part = False
 263     yield part_xml
 264
 265
 266 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 267     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 268
 269     toc = TOC()
 270     for element in chunk_xml[0]:
 271         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 272             toc.add(node_name(element), "part%d.html" % chunk_no)
 273         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 274             subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
 275             element.set('sub', str(subnumber))
 276     if empty:
 277         if not _empty_html_static:
 278             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 279         chars = set()
 280         output_html = _empty_html_static[0]
 281     else:
 282         find_annotations(annotations, chunk_xml, chunk_no)
 283         replace_by_verse(chunk_xml)
 284         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 285         chars = used_chars(html_tree.getroot())
 286         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 287     return output_html, toc, chars
 288
 289
 290 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
 291               style=None, html_toc=False,
 292               sample=None, cover=None, flags=None):
 293     """ produces a EPUB file
 294
 295     provider: a DocProvider
 296     slug: slug of file to process, available by provider
 297     output_file: file-like object or path to output file
 298     output_dir: path to directory to save output file to; either this or output_file must be present
 299     make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
 300     sample=n: generate sample e-book (with at least n paragraphs)
 301     cover: a cover.Cover object
 302     flags: less-advertising, without-fonts
 303     """
 304
 305     def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
 306         """ processes one input file and proceeds to its children """
 307
 308         replace_characters(input_xml.getroot())
 309
 310         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 311
 312         # every input file will have a TOC entry,
 313         # pointing to starting chunk
 314         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), "part%d.html" % chunk_counter)
 315         chars = set()
 316         if first:
 317             # write book title page
 318             html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
 319             chars = used_chars(html_tree.getroot())
 320             zip.writestr('OPS/title.html',
 321                  etree.tostring(html_tree, method="html", pretty_print=True))
 322             # add a title page TOC entry
 323             toc.add(u"Strona tytułowa", "title.html")
 324         elif children:
 325             # write title page for every parent
 326             if sample is not None and sample <= 0:
 327                 chars = set()
 328                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 329             else:
 330                 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
 331                 chars = used_chars(html_tree.getroot())
 332                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 333             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 334             add_to_manifest(manifest, chunk_counter)
 335             add_to_spine(spine, chunk_counter)
 336             chunk_counter += 1
 337
 338         if len(input_xml.getroot()) > 1:
 339             # rdf before style master
 340             main_text = input_xml.getroot()[1]
 341         else:
 342             # rdf in style master
 343             main_text = input_xml.getroot()[0]
 344             if main_text.tag == RDFNS('RDF'):
 345                 main_text = None
 346
 347         if main_text is not None:
 348             for chunk_xml in chop(main_text):
 349                 empty = False
 350                 if sample is not None:
 351                     if sample <= 0:
 352                         empty = True
 353                     else:
 354                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 355                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 356
 357                 toc.extend(chunk_toc)
 358                 chars = chars.union(chunk_chars)
 359                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 360                 add_to_manifest(manifest, chunk_counter)
 361                 add_to_spine(spine, chunk_counter)
 362                 chunk_counter += 1
 363
 364         if children:
 365             for child in children:
 366                 child_xml = etree.parse(provider.by_uri(child))
 367                 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
 368                 toc.append(child_toc)
 369                 chars = chars.union(chunk_chars)
 370
 371         return toc, chunk_counter, chars, sample
 372
 373     # read metadata from the first file
 374     if file_path:
 375         if slug:
 376             raise ValueError('slug or file_path should be specified, not both')
 377         f = open(file_path, 'r')
 378         input_xml = etree.parse(f)
 379         f.close()
 380     else:
 381         if not slug:
 382             raise ValueError('either slug or file_path should be specified')
 383         input_xml = etree.parse(provider[slug])
 384
 385     if flags:
 386         for flag in flags:
 387             input_xml.getroot().set(flag, 'yes')
 388
 389     metadata = input_xml.find('.//'+RDFNS('Description'))
 390     if metadata is None:
 391         raise NoDublinCore('Document has no DublinCore - which is required.')
 392     book_info = BookInfo.from_element(input_xml)
 393     metadata = etree.ElementTree(metadata)
 394
 395     # if output to dir, create the file
 396     if output_dir is not None:
 397         if make_dir:
 398             author = unicode(book_info.author)
 399             output_dir = os.path.join(output_dir, author)
 400             try:
 401                 os.makedirs(output_dir)
 402             except OSError:
 403                 pass
 404         if slug:
 405             output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
 406         else:
 407             output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
 408
 409     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 410
 411     # write static elements
 412     mime = zipfile.ZipInfo()
 413     mime.filename = 'mimetype'
 414     mime.compress_type = zipfile.ZIP_STORED
 415     mime.extra = ''
 416     zip.writestr(mime, 'application/epub+zip')
 417     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 418                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 419                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 420                        'media-type="application/oebps-package+xml" />' \
 421                        '</rootfiles></container>')
 422     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 423     zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
 424     if not style:
 425         style = get_resource('epub/style.css')
 426     zip.write(style, os.path.join('OPS', 'style.css'))
 427
 428     opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
 429     manifest = opf.find('.//' + OPFNS('manifest'))
 430     guide = opf.find('.//' + OPFNS('guide'))
 431     spine = opf.find('.//' + OPFNS('spine'))
 432
 433     if cover:
 434         cover_file = StringIO()
 435         c = cover(book_info.author.readable(), book_info.title)
 436         c.save(cover_file)
 437         c_name = 'cover.%s' % c.ext()
 438         zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
 439         del cover_file
 440
 441         cover_tree = etree.parse(get_resource('epub/cover.html'))
 442         cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
 443         zip.writestr('OPS/cover.html', etree.tostring(
 444                         cover_tree, method="html", pretty_print=True))
 445
 446         manifest.append(etree.fromstring(
 447             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 448         manifest.append(etree.fromstring(
 449             '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
 450         spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
 451         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 452         guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
 453
 454
 455     annotations = etree.Element('annotations')
 456
 457     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 458                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 459                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 460                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 461                                '</navMap></ncx>')
 462     nav_map = toc_file[-1]
 463
 464     if html_toc:
 465         manifest.append(etree.fromstring(
 466             '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
 467         spine.append(etree.fromstring(
 468             '<itemref idref="html_toc" />'))
 469         guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
 470
 471     toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
 472
 473     if len(toc.children) < 2:
 474         toc.add(u"Początek utworu", "part1.html")
 475
 476     # Last modifications in container files and EPUB creation
 477     if len(annotations) > 0:
 478         toc.add("Przypisy", "annotations.html")
 479         manifest.append(etree.fromstring(
 480             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 481         spine.append(etree.fromstring(
 482             '<itemref idref="annotations" />'))
 483         replace_by_verse(annotations)
 484         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 485         chars = chars.union(used_chars(html_tree.getroot()))
 486         zip.writestr('OPS/annotations.html', etree.tostring(
 487                             html_tree, method="html", pretty_print=True))
 488
 489     toc.add("Strona redakcyjna", "last.html")
 490     manifest.append(etree.fromstring(
 491         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 492     spine.append(etree.fromstring(
 493         '<itemref idref="last" />'))
 494     html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
 495     chars.update(used_chars(html_tree.getroot()))
 496     zip.writestr('OPS/last.html', etree.tostring(
 497                         html_tree, method="html", pretty_print=True))
 498
 499     if not flags or not 'without-fonts' in flags:
 500         # strip fonts
 501         tmpdir = mkdtemp('-librarian-epub')
 502         cwd = os.getcwd()
 503
 504         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 505         for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 506             optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 507                               get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 508             if verbose:
 509                 print "Running font-optimizer"
 510                 subprocess.check_call(optimizer_call)
 511             else:
 512                 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 513             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 514             manifest.append(etree.fromstring(
 515                 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
 516         rmtree(tmpdir)
 517         os.chdir(cwd)
 518
 519     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 520     contents = []
 521     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 522     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 523     for st in attributes:
 524         meta = toc_file.makeelement(NCXNS('meta'))
 525         meta.set('name', st)
 526         meta.set('content', '0')
 527         toc_file[0].append(meta)
 528     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 529     toc_file[0][1].set('content', str(toc.depth()))
 530     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 531
 532     # write TOC
 533     if html_toc:
 534         toc.add(u"Spis treści", "toc.html", index=1)
 535         zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
 536     toc.write_to_xml(nav_map)
 537     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 538     zip.close()