librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 from copy import deepcopy
   9 import os
  10 import os.path
  11 import subprocess
  12 from StringIO import StringIO
  13 from copy import deepcopy
  14 from lxml import etree
  15 import zipfile
  16 from tempfile import mkdtemp
  17 from shutil import rmtree
  18
  19 import sys
  20
  21 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
  22 from librarian.dcparser import BookInfo
  23 from librarian.cover import ImageCover
  24
  25 from librarian import functions, get_resource
  26
# Runs once, at import time.
# NOTE(review): presumably registers the person-name helper that the XSLT
# stylesheets used below rely on -- confirm in librarian.functions.
functions.reg_person_name()
  28
  29
  30 def inner_xml(node):
  31     """ returns node's text and children as a string
  32
  33     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  34     x<b>y</b>z
  35     """
  36
  37     nt = node.text if node.text is not None else ''
  38     return ''.join([nt] + [etree.tostring(child) for child in node])
  39
  40 def set_inner_xml(node, text):
  41     """ sets node's text and children from a string
  42
  43     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  44     >>> set_inner_xml(e, 'x<b>y</b>z')
  45     >>> print etree.tostring(e)
  46     <a>x<b>y</b>z</a>
  47     """
  48
  49     p = etree.fromstring('<x>%s</x>' % text)
  50     node.text = p.text
  51     node[:] = p[:]
  52
  53
  54 def node_name(node):
  55     """ Find out a node's name
  56
  57     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  58     XYZ
  59     """
  60
  61     tempnode = deepcopy(node)
  62
  63     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  64         for e in tempnode.findall('.//%s' % p):
  65             t = e.tail
  66             e.clear()
  67             e.tail = t
  68     etree.strip_tags(tempnode, '*')
  69     return tempnode.text
  70
  71
  72 def xslt(xml, sheet):
  73     if isinstance(xml, etree._Element):
  74         xml = etree.ElementTree(xml)
  75     with open(sheet) as xsltf:
  76         return xml.xslt(etree.parse(xsltf))
  77
  78
  79 def replace_characters(node):
  80     def replace_chars(text):
  81         if text is None:
  82             return None
  83         return text.replace(u"\ufeff", u"")\
  84                    .replace("---", u"\u2014")\
  85                    .replace("--", u"\u2013")\
  86                    .replace(",,", u"\u201E")\
  87                    .replace('"', u"\u201D")\
  88                    .replace("'", u"\u2019")
  89     if node.tag in ('uwaga', 'extra'):
  90         t = node.tail
  91         node.clear()
  92         node.tail = t
  93     node.text = replace_chars(node.text)
  94     node.tail = replace_chars(node.tail)
  95     for child in node:
  96         replace_characters(child)
  97
  98
  99 def find_annotations(annotations, source, part_no):
 100     for child in source:
 101         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 102             annotation = deepcopy(child)
 103             number = str(len(annotations)+1)
 104             annotation.set('number', number)
 105             annotation.set('part', str(part_no))
 106             annotation.tail = ''
 107             annotations.append(annotation)
 108             tail = child.tail
 109             child.clear()
 110             child.tail = tail
 111             child.text = number
 112         if child.tag not in ('extra', 'uwaga'):
 113             find_annotations(annotations, child, part_no)
 114
 115
 116 def replace_by_verse(tree):
 117     """ Find stanzas and create new verses in place of a '/' character """
 118
 119     stanzas = tree.findall('.//' + WLNS('strofa'))
 120     for node in stanzas:
 121         for child_node in node:
 122             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 123                 foreign_verses = inner_xml(child_node).split('/\n')
 124                 if len(foreign_verses) > 1:
 125                     new_foreign = ''
 126                     for foreign_verse in foreign_verses:
 127                         if foreign_verse.startswith('<wers'):
 128                             new_foreign += foreign_verse
 129                         else:
 130                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 131                     set_inner_xml(child_node, new_foreign)
 132         verses = inner_xml(node).split('/\n')
 133         if len(verses) > 1:
 134             modified_inner_xml = ''
 135             for verse in verses:
 136                 if verse.startswith('<wers') or verse.startswith('<extra'):
 137                     modified_inner_xml += verse
 138                 else:
 139                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 140             set_inner_xml(node, modified_inner_xml)
 141
 142
 143 def add_to_manifest(manifest, partno):
 144     """ Adds a node to the manifest section in content.opf file """
 145
 146     partstr = 'part%d' % partno
 147     e = manifest.makeelement(OPFNS('item'), attrib={
 148                                  'id': partstr,
 149                                  'href': partstr + '.html',
 150                                  'media-type': 'application/xhtml+xml',
 151                              })
 152     manifest.append(e)
 153
 154
 155 def add_to_spine(spine, partno):
 156     """ Adds a node to the spine section in content.opf file """
 157
 158     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 159     spine.append(e)
 160
 161
 162 class TOC(object):
 163     def __init__(self, name=None, part_number=None):
 164         self.children = []
 165         self.name = name
 166         self.part_number = part_number
 167         self.sub_number = None
 168
 169     def add(self, name, part_number, level=0, is_part=True):
 170         if level > 0 and self.children:
 171             return self.children[-1].add(name, part_number, level-1, is_part)
 172         else:
 173             t = TOC(name)
 174             t.part_number = part_number
 175             self.children.append(t)
 176             if not is_part:
 177                 t.sub_number = len(self.children) + 1
 178                 return t.sub_number
 179
 180     def append(self, toc):
 181         self.children.append(toc)
 182
 183     def extend(self, toc):
 184         self.children.extend(toc.children)
 185
 186     def depth(self):
 187         if self.children:
 188             return max((c.depth() for c in self.children)) + 1
 189         else:
 190             return 0
 191
 192     def write_to_xml(self, nav_map, counter):
 193         for child in self.children:
 194             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 195             nav_point.set('id', 'NavPoint-%d' % counter)
 196             nav_point.set('playOrder', str(counter))
 197
 198             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 199             text = nav_map.makeelement(NCXNS('text'))
 200             text.text = child.name
 201             nav_label.append(text)
 202             nav_point.append(nav_label)
 203
 204             content = nav_map.makeelement(NCXNS('content'))
 205             src = 'part%d.html' % child.part_number
 206             if child.sub_number is not None:
 207                 src += '#sub%d' % child.sub_number
 208             content.set('src', src)
 209             nav_point.append(content)
 210             nav_map.append(nav_point)
 211             counter = child.write_to_xml(nav_point, counter + 1)
 212         return counter
 213
 214
 215 def used_chars(element):
 216     """ Lists characters used in an ETree Element """
 217     chars = set((element.text or '') + (element.tail or ''))
 218     for child in element:
 219         chars = chars.union(used_chars(child))
 220     return chars
 221
 222
 223 def chop(main_text):
 224     """ divide main content of the XML file into chunks """
 225
 226     # prepare a container for each chunk
 227     part_xml = etree.Element('utwor')
 228     etree.SubElement(part_xml, 'master')
 229     main_xml_part = part_xml[0] # master
 230
 231     last_node_part = False
 232     for one_part in main_text:
 233         name = one_part.tag
 234         if name == 'naglowek_czesc':
 235             yield part_xml
 236             last_node_part = True
 237             main_xml_part[:] = [deepcopy(one_part)]
 238         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 239             yield part_xml
 240             main_xml_part[:] = [deepcopy(one_part)]
 241         else:
 242             main_xml_part.append(deepcopy(one_part))
 243             last_node_part = False
 244     yield part_xml
 245
 246
 247 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 248     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 249
 250     toc = TOC()
 251     for element in chunk_xml[0]:
 252         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 253             toc.add(node_name(element), chunk_no)
 254         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 255             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 256             element.set('sub', str(subnumber))
 257     if empty:
 258         if not _empty_html_static:
 259             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 260         chars = set()
 261         output_html = _empty_html_static[0]
 262     else:
 263         find_annotations(annotations, chunk_xml, chunk_no)
 264         replace_by_verse(chunk_xml)
 265         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 266         chars = used_chars(html_tree.getroot())
 267         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 268     return output_html, toc, chars
 269
 270
 271 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
 272               sample=None, cover=None, flags=None):
 273     """ produces a EPUB file
 274
 275     provider: a DocProvider
 276     slug: slug of file to process, available by provider
 277     output_file: file-like object or path to output file
 278     output_dir: path to directory to save output file to; either this or output_file must be present
 279     make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
 280     sample=n: generate sample e-book (with at least n paragraphs)
 281     cover: a cover.Cover object
 282     flags: less-advertising, images, not-wl
 283     """
 284
 285     def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
 286         """ processes one input file and proceeds to its children """
 287
 288         replace_characters(input_xml.getroot())
 289
 290         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 291
 292         # every input file will have a TOC entry,
 293         # pointing to starting chunk
 294         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 295         chars = set()
 296         if first:
 297             # write book title page
 298             html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
 299             chars = used_chars(html_tree.getroot())
 300             zip.writestr('OPS/title.html',
 301                  etree.tostring(html_tree, method="html", pretty_print=True))
 302         elif children:
 303             # write title page for every parent
 304             if sample is not None and sample <= 0:
 305                 chars = set()
 306                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 307             else:
 308                 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
 309                 chars = used_chars(html_tree.getroot())
 310                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 311             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 312             add_to_manifest(manifest, chunk_counter)
 313             add_to_spine(spine, chunk_counter)
 314             chunk_counter += 1
 315
 316         if len(input_xml.getroot()) > 1:
 317             # rdf before style master
 318             main_text = input_xml.getroot()[1]
 319         else:
 320             # rdf in style master
 321             main_text = input_xml.getroot()[0]
 322             if main_text.tag == RDFNS('RDF'):
 323                 main_text = None
 324
 325         if main_text is not None:
 326             for chunk_xml in chop(main_text):
 327                 empty = False
 328                 if sample is not None:
 329                     if sample <= 0:
 330                         empty = True
 331                     else:
 332                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 333                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 334
 335                 toc.extend(chunk_toc)
 336                 chars = chars.union(chunk_chars)
 337                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 338                 add_to_manifest(manifest, chunk_counter)
 339                 add_to_spine(spine, chunk_counter)
 340                 chunk_counter += 1
 341
 342         if children:
 343             for child in children:
 344                 child_xml = etree.parse(provider.by_uri(child))
 345                 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
 346                 toc.append(child_toc)
 347                 chars = chars.union(chunk_chars)
 348
 349         return toc, chunk_counter, chars, sample
 350
 351     # read metadata from the first file
 352     if file_path:
 353         if slug:
 354             raise ValueError('slug or file_path should be specified, not both')
 355         f = open(file_path, 'r')
 356         input_xml = etree.parse(f)
 357         f.close()
 358     else:
 359         if not slug:
 360             raise ValueError('either slug or file_path should be specified')
 361         input_xml = etree.parse(provider[slug])
 362
 363     if flags:
 364         for flag in flags:
 365             input_xml.getroot().set(flag, 'yes')
 366
 367     metadata = input_xml.find('.//'+RDFNS('Description'))
 368     if metadata is None:
 369         raise NoDublinCore('Document has no DublinCore - which is required.')
 370     book_info = BookInfo.from_element(input_xml)
 371     metadata = etree.ElementTree(metadata)
 372
 373     # if output to dir, create the file
 374     if output_dir is not None:
 375         if make_dir:
 376             author = unicode(book_info.author)
 377             output_dir = os.path.join(output_dir, author)
 378             try:
 379                 os.makedirs(output_dir)
 380             except OSError:
 381                 pass
 382         if slug:
 383             output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
 384         else:
 385             output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
 386
 387     opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
 388     manifest = opf.find('.//' + OPFNS('manifest'))
 389     spine = opf.find('.//' + OPFNS('spine'))
 390
 391     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 392
 393     # write static elements
 394     mime = zipfile.ZipInfo()
 395     mime.filename = 'mimetype'
 396     mime.compress_type = zipfile.ZIP_STORED
 397     mime.extra = ''
 398     zip.writestr(mime, 'application/epub+zip')
 399     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 400                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 401                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 402                        'media-type="application/oebps-package+xml" />' \
 403                        '</rootfiles></container>')
 404     zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
 405     if not flags or 'not-wl' not in flags:
 406         manifest.append(etree.fromstring(
 407             '<item id="logo_wolnelektury" href="logo_wolnelektury.png" media-type="image/png" />'))
 408         zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 409
 410     if cover:
 411         cover_file = StringIO()
 412         c = cover(book_info.author.readable(), book_info.title)
 413         c.save(cover_file)
 414         c_name = 'cover.%s' % c.ext()
 415         zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
 416         del cover_file
 417
 418         cover_tree = etree.parse(get_resource('epub/cover.html'))
 419         cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
 420         zip.writestr('OPS/cover.html', etree.tostring(
 421                         cover_tree, method="html", pretty_print=True))
 422
 423         manifest.append(etree.fromstring(
 424             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 425         manifest.append(etree.fromstring(
 426             '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
 427         spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
 428         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 429         opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))
 430
 431     if flags and 'images' in flags:
 432         for ilustr in input_xml.findall('//ilustr'):
 433             src = ilustr.get('src')
 434             mime = ImageCover(src)().mime_type()
 435             zip.write(src, os.path.join('OPS', src))
 436             manifest.append(etree.fromstring(
 437                 '<item id="%s" href="%s" media-type="%s" />' % (src, src, mime)))
 438             # get it up to master
 439             after = ilustr
 440             while after.getparent().tag not in ['powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp', 'dramat_wierszowany_l', 'dramat_wierszowany_lp', 'dramat_wspolczesny']:
 441                 after = after.getparent()
 442             if not(after is ilustr):
 443                 moved = deepcopy(ilustr)
 444                 ilustr.tag = 'extra'
 445                 ilustr.text = None
 446                 moved.tail = None
 447                 after.addnext(moved)
 448     else:
 449         for ilustr in input_xml.findall('//ilustr'):
 450             ilustr.tag = 'extra'
 451
 452     annotations = etree.Element('annotations')
 453
 454     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 455                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 456                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 457                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 458                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 459                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 460                                '</navPoint></navMap></ncx>')
 461     nav_map = toc_file[-1]
 462
 463     toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
 464
 465     if not toc.children:
 466         toc.add(u"Początek utworu", 1)
 467     toc_counter = toc.write_to_xml(nav_map, 2)
 468
 469     # Last modifications in container files and EPUB creation
 470     if len(annotations) > 0:
 471         nav_map.append(etree.fromstring(
 472             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 473             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 474         toc_counter += 1
 475         manifest.append(etree.fromstring(
 476             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 477         spine.append(etree.fromstring(
 478             '<itemref idref="annotations" />'))
 479         replace_by_verse(annotations)
 480         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 481         chars = chars.union(used_chars(html_tree.getroot()))
 482         zip.writestr('OPS/annotations.html', etree.tostring(
 483                             html_tree, method="html", pretty_print=True))
 484
 485     nav_map.append(etree.fromstring(
 486         '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
 487         '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
 488     manifest.append(etree.fromstring(
 489         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 490     spine.append(etree.fromstring(
 491         '<itemref idref="last" />'))
 492     stopka = input_xml.find('//stopka')
 493     if stopka is not None:
 494         stopka.tag = 'stopka_'
 495         replace_by_verse(stopka)
 496         html_tree = xslt(stopka, get_resource('epub/xsltScheme.xsl'))
 497     else:
 498         html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
 499     chars.update(used_chars(html_tree.getroot()))
 500     zip.writestr('OPS/last.html', etree.tostring(
 501                         html_tree, method="html", pretty_print=True))
 502
 503     # strip fonts
 504     tmpdir = mkdtemp('-librarian-epub')
 505     cwd = os.getcwd()
 506
 507     os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 508     for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 509         optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 510                           get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 511         if verbose:
 512             print "Running font-optimizer"
 513             subprocess.check_call(optimizer_call)
 514         else:
 515             subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 516         zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 517     rmtree(tmpdir)
 518     os.chdir(cwd)
 519
 520     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 521     contents = []
 522     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 523     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 524     for st in attributes:
 525         meta = toc_file.makeelement(NCXNS('meta'))
 526         meta.set('name', st)
 527         meta.set('content', '0')
 528         toc_file[0].append(meta)
 529     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 530     toc_file[0][1].set('content', str(toc.depth()))
 531     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 532     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 533     zip.close()