librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from StringIO import StringIO
  12 from copy import deepcopy
  13 from lxml import etree
  14 import zipfile
  15 from tempfile import mkdtemp
  16 from shutil import rmtree
  17
  18 import sys
  19
  20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
  21 from librarian.dcparser import BookInfo
  22
  23 from librarian import functions, get_resource
  24
# Executed once, at import time, as a side effect of importing this module.
# NOTE(review): presumably registers a "person_name" helper that the XSLT
# stylesheets used below rely on -- confirm in librarian.functions.
functions.reg_person_name()
  26
  27
  28 def inner_xml(node):
  29     """ returns node's text and children as a string
  30
  31     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  32     x<b>y</b>z
  33     """
  34
  35     nt = node.text if node.text is not None else ''
  36     return ''.join([nt] + [etree.tostring(child) for child in node])
  37
  38 def set_inner_xml(node, text):
  39     """ sets node's text and children from a string
  40
  41     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  42     >>> set_inner_xml(e, 'x<b>y</b>z')
  43     >>> print etree.tostring(e)
  44     <a>x<b>y</b>z</a>
  45     """
  46
  47     p = etree.fromstring('<x>%s</x>' % text)
  48     node.text = p.text
  49     node[:] = p[:]
  50
  51
  52 def node_name(node):
  53     """ Find out a node's name
  54
  55     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  56     XYZ
  57     """
  58
  59     tempnode = deepcopy(node)
  60
  61     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  62         for e in tempnode.findall('.//%s' % p):
  63             t = e.tail
  64             e.clear()
  65             e.tail = t
  66     etree.strip_tags(tempnode, '*')
  67     return tempnode.text
  68
  69
  70 def xslt(xml, sheet):
  71     if isinstance(xml, etree._Element):
  72         xml = etree.ElementTree(xml)
  73     with open(sheet) as xsltf:
  74         return xml.xslt(etree.parse(xsltf))
  75
  76
  77 def replace_characters(node):
  78     def replace_chars(text):
  79         if text is None:
  80             return None
  81         return text.replace(u"\ufeff", u"")\
  82                    .replace("---", u"\u2014")\
  83                    .replace("--", u"\u2013")\
  84                    .replace(",,", u"\u201E")\
  85                    .replace('"', u"\u201D")\
  86                    .replace("'", u"\u2019")
  87     if node.tag == 'extra':
  88         node.clear()
  89     else:
  90         node.text = replace_chars(node.text)
  91         node.tail = replace_chars(node.tail)
  92         for child in node:
  93             replace_characters(child)
  94
  95
  96 def find_annotations(annotations, source, part_no):
  97     for child in source:
  98         if child.tag in ('pe', 'pa', 'pt', 'pr'):
  99             annotation = deepcopy(child)
 100             number = str(len(annotations)+1)
 101             annotation.set('number', number)
 102             annotation.set('part', str(part_no))
 103             annotation.tail = ''
 104             annotations.append(annotation)
 105             tail = child.tail
 106             child.clear()
 107             child.tail = tail
 108             child.text = number
 109         if child.tag not in ('extra',):
 110             find_annotations(annotations, child, part_no)
 111
 112
 113 def replace_by_verse(tree):
 114     """ Find stanzas and create new verses in place of a '/' character """
 115
 116     stanzas = tree.findall('.//' + WLNS('strofa'))
 117     for node in stanzas:
 118         for child_node in node:
 119             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 120                 foreign_verses = inner_xml(child_node).split('/\n')
 121                 if len(foreign_verses) > 1:
 122                     new_foreign = ''
 123                     for foreign_verse in foreign_verses:
 124                         if foreign_verse.startswith('<wers'):
 125                             new_foreign += foreign_verse
 126                         else:
 127                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 128                     set_inner_xml(child_node, new_foreign)
 129         verses = inner_xml(node).split('/\n')
 130         if len(verses) > 1:
 131             modified_inner_xml = ''
 132             for verse in verses:
 133                 if verse.startswith('<wers') or verse.startswith('<extra'):
 134                     modified_inner_xml += verse
 135                 else:
 136                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 137             set_inner_xml(node, modified_inner_xml)
 138
 139
 140 def add_to_manifest(manifest, partno):
 141     """ Adds a node to the manifest section in content.opf file """
 142
 143     partstr = 'part%d' % partno
 144     e = manifest.makeelement(OPFNS('item'), attrib={
 145                                  'id': partstr,
 146                                  'href': partstr + '.html',
 147                                  'media-type': 'application/xhtml+xml',
 148                              })
 149     manifest.append(e)
 150
 151
 152 def add_to_spine(spine, partno):
 153     """ Adds a node to the spine section in content.opf file """
 154
 155     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 156     spine.append(e)
 157
 158
 159 class TOC(object):
 160     def __init__(self, name=None, part_number=None):
 161         self.children = []
 162         self.name = name
 163         self.part_number = part_number
 164         self.sub_number = None
 165
 166     def add(self, name, part_number, level=0, is_part=True):
 167         if level > 0 and self.children:
 168             return self.children[-1].add(name, part_number, level-1, is_part)
 169         else:
 170             t = TOC(name)
 171             t.part_number = part_number
 172             self.children.append(t)
 173             if not is_part:
 174                 t.sub_number = len(self.children) + 1
 175                 return t.sub_number
 176
 177     def append(self, toc):
 178         self.children.append(toc)
 179
 180     def extend(self, toc):
 181         self.children.extend(toc.children)
 182
 183     def depth(self):
 184         if self.children:
 185             return max((c.depth() for c in self.children)) + 1
 186         else:
 187             return 0
 188
 189     def write_to_xml(self, nav_map, counter):
 190         for child in self.children:
 191             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 192             nav_point.set('id', 'NavPoint-%d' % counter)
 193             nav_point.set('playOrder', str(counter))
 194
 195             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 196             text = nav_map.makeelement(NCXNS('text'))
 197             text.text = child.name
 198             nav_label.append(text)
 199             nav_point.append(nav_label)
 200
 201             content = nav_map.makeelement(NCXNS('content'))
 202             src = 'part%d.html' % child.part_number
 203             if child.sub_number is not None:
 204                 src += '#sub%d' % child.sub_number
 205             content.set('src', src)
 206             nav_point.append(content)
 207             nav_map.append(nav_point)
 208             counter = child.write_to_xml(nav_point, counter + 1)
 209         return counter
 210
 211
 212 def used_chars(element):
 213     """ Lists characters used in an ETree Element """
 214     chars = set((element.text or '') + (element.tail or ''))
 215     for child in element:
 216         chars = chars.union(used_chars(child))
 217     return chars
 218
 219
 220 def chop(main_text):
 221     """ divide main content of the XML file into chunks """
 222
 223     # prepare a container for each chunk
 224     part_xml = etree.Element('utwor')
 225     etree.SubElement(part_xml, 'master')
 226     main_xml_part = part_xml[0] # master
 227
 228     last_node_part = False
 229     for one_part in main_text:
 230         name = one_part.tag
 231         if name == 'naglowek_czesc':
 232             yield part_xml
 233             last_node_part = True
 234             main_xml_part[:] = [deepcopy(one_part)]
 235         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 236             yield part_xml
 237             main_xml_part[:] = [deepcopy(one_part)]
 238         else:
 239             main_xml_part.append(deepcopy(one_part))
 240             last_node_part = False
 241     yield part_xml
 242
 243
 244 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 245     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 246
 247     toc = TOC()
 248     for element in chunk_xml[0]:
 249         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 250             toc.add(node_name(element), chunk_no)
 251         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 252             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 253             element.set('sub', str(subnumber))
 254     if empty:
 255         if not _empty_html_static:
 256             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 257         chars = set()
 258         output_html = _empty_html_static[0]
 259     else:
 260         find_annotations(annotations, chunk_xml, chunk_no)
 261         replace_by_verse(chunk_xml)
 262         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 263         chars = used_chars(html_tree.getroot())
 264         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 265     return output_html, toc, chars
 266
 267
 268 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
 269               sample=None, cover=None, flags=None):
 270     """ produces a EPUB file
 271
 272     provider: a DocProvider
 273     slug: slug of file to process, available by provider
 274     output_file: file-like object or path to output file
 275     output_dir: path to directory to save output file to; either this or output_file must be present
 276     make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
 277     sample=n: generate sample e-book (with at least n paragraphs)
 278     cover: a cover.Cover object
 279     flags: less-advertising,
 280     """
 281
 282     def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
 283         """ processes one input file and proceeds to its children """
 284
 285         replace_characters(input_xml.getroot())
 286
 287         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 288
 289         # every input file will have a TOC entry,
 290         # pointing to starting chunk
 291         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 292         chars = set()
 293         if first:
 294             # write book title page
 295             html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
 296             chars = used_chars(html_tree.getroot())
 297             zip.writestr('OPS/title.html',
 298                  etree.tostring(html_tree, method="html", pretty_print=True))
 299         elif children:
 300             # write title page for every parent
 301             if sample is not None and sample <= 0:
 302                 chars = set()
 303                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 304             else:
 305                 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
 306                 chars = used_chars(html_tree.getroot())
 307                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 308             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 309             add_to_manifest(manifest, chunk_counter)
 310             add_to_spine(spine, chunk_counter)
 311             chunk_counter += 1
 312
 313         if len(input_xml.getroot()) > 1:
 314             # rdf before style master
 315             main_text = input_xml.getroot()[1]
 316         else:
 317             # rdf in style master
 318             main_text = input_xml.getroot()[0]
 319             if main_text.tag == RDFNS('RDF'):
 320                 main_text = None
 321
 322         if main_text is not None:
 323             for chunk_xml in chop(main_text):
 324                 empty = False
 325                 if sample is not None:
 326                     if sample <= 0:
 327                         empty = True
 328                     else:
 329                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 330                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 331
 332                 toc.extend(chunk_toc)
 333                 chars = chars.union(chunk_chars)
 334                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 335                 add_to_manifest(manifest, chunk_counter)
 336                 add_to_spine(spine, chunk_counter)
 337                 chunk_counter += 1
 338
 339         if children:
 340             for child in children:
 341                 child_xml = etree.parse(provider.by_uri(child))
 342                 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
 343                 toc.append(child_toc)
 344                 chars = chars.union(chunk_chars)
 345
 346         return toc, chunk_counter, chars, sample
 347
 348     # read metadata from the first file
 349     if file_path:
 350         if slug:
 351             raise ValueError('slug or file_path should be specified, not both')
 352         f = open(file_path, 'r')
 353         input_xml = etree.parse(f)
 354         f.close()
 355     else:
 356         if not slug:
 357             raise ValueError('either slug or file_path should be specified')
 358         input_xml = etree.parse(provider[slug])
 359
 360     if flags:
 361         for flag in flags:
 362             input_xml.getroot().set(flag, 'yes')
 363
 364     metadata = input_xml.find('.//'+RDFNS('Description'))
 365     if metadata is None:
 366         raise NoDublinCore('Document has no DublinCore - which is required.')
 367     book_info = BookInfo.from_element(input_xml)
 368     metadata = etree.ElementTree(metadata)
 369
 370     # if output to dir, create the file
 371     if output_dir is not None:
 372         if make_dir:
 373             author = unicode(book_info.author)
 374             output_dir = os.path.join(output_dir, author)
 375             try:
 376                 os.makedirs(output_dir)
 377             except OSError:
 378                 pass
 379         if slug:
 380             output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
 381         else:
 382             output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
 383
 384     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 385
 386     # write static elements
 387     mime = zipfile.ZipInfo()
 388     mime.filename = 'mimetype'
 389     mime.compress_type = zipfile.ZIP_STORED
 390     mime.extra = ''
 391     zip.writestr(mime, 'application/epub+zip')
 392     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 393                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 394                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 395                        'media-type="application/oebps-package+xml" />' \
 396                        '</rootfiles></container>')
 397     zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
 398     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 399
 400     opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
 401     manifest = opf.find('.//' + OPFNS('manifest'))
 402     spine = opf.find('.//' + OPFNS('spine'))
 403
 404     if cover:
 405         cover_file = StringIO()
 406         c = cover(book_info.author.readable(), book_info.title)
 407         c.save(cover_file)
 408         c_name = 'cover.%s' % c.ext()
 409         zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
 410         del cover_file
 411
 412         cover_tree = etree.parse(get_resource('epub/cover.html'))
 413         cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
 414         zip.writestr('OPS/cover.html', etree.tostring(
 415                         cover_tree, method="html", pretty_print=True))
 416
 417         manifest.append(etree.fromstring(
 418             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 419         manifest.append(etree.fromstring(
 420             '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
 421         spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
 422         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 423         opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))
 424
 425
 426     annotations = etree.Element('annotations')
 427
 428     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 429                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 430                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 431                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 432                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 433                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 434                                '</navPoint></navMap></ncx>')
 435     nav_map = toc_file[-1]
 436
 437     toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
 438
 439     if not toc.children:
 440         toc.add(u"Początek utworu", 1)
 441     toc_counter = toc.write_to_xml(nav_map, 2)
 442
 443     # Last modifications in container files and EPUB creation
 444     if len(annotations) > 0:
 445         nav_map.append(etree.fromstring(
 446             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 447             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 448         toc_counter += 1
 449         manifest.append(etree.fromstring(
 450             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 451         spine.append(etree.fromstring(
 452             '<itemref idref="annotations" />'))
 453         replace_by_verse(annotations)
 454         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 455         chars = chars.union(used_chars(html_tree.getroot()))
 456         zip.writestr('OPS/annotations.html', etree.tostring(
 457                             html_tree, method="html", pretty_print=True))
 458
 459     nav_map.append(etree.fromstring(
 460         '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
 461         '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
 462     manifest.append(etree.fromstring(
 463         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 464     spine.append(etree.fromstring(
 465         '<itemref idref="last" />'))
 466     html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
 467     chars.update(used_chars(html_tree.getroot()))
 468     zip.writestr('OPS/last.html', etree.tostring(
 469                         html_tree, method="html", pretty_print=True))
 470
 471     # strip fonts
 472     tmpdir = mkdtemp('-librarian-epub')
 473     cwd = os.getcwd()
 474
 475     os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 476     for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 477         optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 478                           get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 479         if verbose:
 480             print "Running font-optimizer"
 481             subprocess.check_call(optimizer_call)
 482         else:
 483             subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 484         zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 485     rmtree(tmpdir)
 486     os.chdir(cwd)
 487
 488     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 489     contents = []
 490     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 491     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 492     for st in attributes:
 493         meta = toc_file.makeelement(NCXNS('meta'))
 494         meta.set('name', st)
 495         meta.set('content', '0')
 496         toc_file[0].append(meta)
 497     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 498     toc_file[0][1].set('content', str(toc.depth()))
 499     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 500     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 501     zip.close()