librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from StringIO import StringIO
  12 from copy import deepcopy
  13 from lxml import etree
  14 import zipfile
  15 from tempfile import mkdtemp
  16 from shutil import rmtree
  17
  18 import sys
  19
  20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
  21 from librarian.dcparser import BookInfo
  22
  23 from librarian import functions, get_resource
  24
# Runs once at import time. NOTE(review): presumably registers the
# `person_name` extension function used by the XSLT stylesheets - confirm in
# librarian.functions.
functions.reg_person_name()
  26
  27
  28 def inner_xml(node):
  29     """ returns node's text and children as a string
  30
  31     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  32     x<b>y</b>z
  33     """
  34
  35     nt = node.text if node.text is not None else ''
  36     return ''.join([nt] + [etree.tostring(child) for child in node])
  37
  38 def set_inner_xml(node, text):
  39     """ sets node's text and children from a string
  40
  41     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  42     >>> set_inner_xml(e, 'x<b>y</b>z')
  43     >>> print etree.tostring(e)
  44     <a>x<b>y</b>z</a>
  45     """
  46
  47     p = etree.fromstring('<x>%s</x>' % text)
  48     node.text = p.text
  49     node[:] = p[:]
  50
  51
  52 def node_name(node):
  53     """ Find out a node's name
  54
  55     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  56     XYZ
  57     """
  58
  59     tempnode = deepcopy(node)
  60
  61     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  62         for e in tempnode.findall('.//%s' % p):
  63             t = e.tail
  64             e.clear()
  65             e.tail = t
  66     etree.strip_tags(tempnode, '*')
  67     return tempnode.text
  68
  69
  70 def xslt(xml, sheet):
  71     if isinstance(xml, etree._Element):
  72         xml = etree.ElementTree(xml)
  73     with open(sheet) as xsltf:
  74         return xml.xslt(etree.parse(xsltf))
  75
  76
  77 def replace_characters(node):
  78     def replace_chars(text):
  79         if text is None:
  80             return None
  81         return text.replace(u"\ufeff", u"")\
  82                    .replace("---", u"\u2014")\
  83                    .replace("--", u"\u2013")\
  84                    .replace(",,", u"\u201E")\
  85                    .replace('"', u"\u201D")\
  86                    .replace("'", u"\u2019")
  87     node.text = replace_chars(node.text)
  88     node.tail = replace_chars(node.tail)
  89     for child in node:
  90         replace_characters(child)
  91
  92
  93 def find_annotations(annotations, source, part_no):
  94     for child in source:
  95         if child.tag in ('pe', 'pa', 'pt', 'pr'):
  96             annotation = deepcopy(child)
  97             number = str(len(annotations)+1)
  98             annotation.set('number', number)
  99             annotation.set('part', str(part_no))
 100             annotation.tail = ''
 101             annotations.append(annotation)
 102             tail = child.tail
 103             child.clear()
 104             child.tail = tail
 105             child.text = number
 106         if child.tag not in ('extra', 'uwaga'):
 107             find_annotations(annotations, child, part_no)
 108
 109
 110 def replace_by_verse(tree):
 111     """ Find stanzas and create new verses in place of a '/' character """
 112
 113     stanzas = tree.findall('.//' + WLNS('strofa'))
 114     for node in stanzas:
 115         for child_node in node:
 116             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 117                 foreign_verses = inner_xml(child_node).split('/\n')
 118                 if len(foreign_verses) > 1:
 119                     new_foreign = ''
 120                     for foreign_verse in foreign_verses:
 121                         if foreign_verse.startswith('<wers'):
 122                             new_foreign += foreign_verse
 123                         else:
 124                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 125                     set_inner_xml(child_node, new_foreign)
 126         verses = inner_xml(node).split('/\n')
 127         if len(verses) > 1:
 128             modified_inner_xml = ''
 129             for verse in verses:
 130                 if verse.startswith('<wers') or verse.startswith('<extra'):
 131                     modified_inner_xml += verse
 132                 else:
 133                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 134             set_inner_xml(node, modified_inner_xml)
 135
 136
 137 def add_to_manifest(manifest, partno):
 138     """ Adds a node to the manifest section in content.opf file """
 139
 140     partstr = 'part%d' % partno
 141     e = manifest.makeelement(OPFNS('item'), attrib={
 142                                  'id': partstr,
 143                                  'href': partstr + '.html',
 144                                  'media-type': 'application/xhtml+xml',
 145                              })
 146     manifest.append(e)
 147
 148
 149 def add_to_spine(spine, partno):
 150     """ Adds a node to the spine section in content.opf file """
 151
 152     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 153     spine.append(e)
 154
 155
 156 class TOC(object):
 157     def __init__(self, name=None, part_number=None):
 158         self.children = []
 159         self.name = name
 160         self.part_number = part_number
 161         self.sub_number = None
 162
 163     def add(self, name, part_number, level=0, is_part=True):
 164         if level > 0 and self.children:
 165             return self.children[-1].add(name, part_number, level-1, is_part)
 166         else:
 167             t = TOC(name)
 168             t.part_number = part_number
 169             self.children.append(t)
 170             if not is_part:
 171                 t.sub_number = len(self.children) + 1
 172                 return t.sub_number
 173
 174     def append(self, toc):
 175         self.children.append(toc)
 176
 177     def extend(self, toc):
 178         self.children.extend(toc.children)
 179
 180     def depth(self):
 181         if self.children:
 182             return max((c.depth() for c in self.children)) + 1
 183         else:
 184             return 0
 185
 186     def write_to_xml(self, nav_map, counter):
 187         for child in self.children:
 188             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 189             nav_point.set('id', 'NavPoint-%d' % counter)
 190             nav_point.set('playOrder', str(counter))
 191
 192             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 193             text = nav_map.makeelement(NCXNS('text'))
 194             text.text = child.name
 195             nav_label.append(text)
 196             nav_point.append(nav_label)
 197
 198             content = nav_map.makeelement(NCXNS('content'))
 199             src = 'part%d.html' % child.part_number
 200             if child.sub_number is not None:
 201                 src += '#sub%d' % child.sub_number
 202             content.set('src', src)
 203             nav_point.append(content)
 204             nav_map.append(nav_point)
 205             counter = child.write_to_xml(nav_point, counter + 1)
 206         return counter
 207
 208
 209 def used_chars(element):
 210     """ Lists characters used in an ETree Element """
 211     chars = set((element.text or '') + (element.tail or ''))
 212     for child in element:
 213         chars = chars.union(used_chars(child))
 214     return chars
 215
 216
 217 def chop(main_text):
 218     """ divide main content of the XML file into chunks """
 219
 220     # prepare a container for each chunk
 221     part_xml = etree.Element('utwor')
 222     etree.SubElement(part_xml, 'master')
 223     main_xml_part = part_xml[0] # master
 224
 225     last_node_part = False
 226     for one_part in main_text:
 227         name = one_part.tag
 228         if name == 'naglowek_czesc':
 229             yield part_xml
 230             last_node_part = True
 231             main_xml_part[:] = [deepcopy(one_part)]
 232         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 233             yield part_xml
 234             main_xml_part[:] = [deepcopy(one_part)]
 235         else:
 236             main_xml_part.append(deepcopy(one_part))
 237             last_node_part = False
 238     yield part_xml
 239
 240
 241 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 242     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 243
 244     toc = TOC()
 245     for element in chunk_xml[0]:
 246         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 247             toc.add(node_name(element), chunk_no)
 248         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 249             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 250             element.set('sub', str(subnumber))
 251     if empty:
 252         if not _empty_html_static:
 253             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 254         chars = set()
 255         output_html = _empty_html_static[0]
 256     else:
 257         find_annotations(annotations, chunk_xml, chunk_no)
 258         replace_by_verse(chunk_xml)
 259         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 260         chars = used_chars(html_tree.getroot())
 261         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 262     return output_html, toc, chars
 263
 264
 265 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
 266               sample=None, cover=None, flags=None):
 267     """ produces a EPUB file
 268
 269     provider: a DocProvider
 270     slug: slug of file to process, available by provider
 271     output_file: file-like object or path to output file
 272     output_dir: path to directory to save output file to; either this or output_file must be present
 273     make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
 274     sample=n: generate sample e-book (with at least n paragraphs)
 275     cover: a cover.Cover object
 276     flags: less-advertising,
 277     """
 278
 279     def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
 280         """ processes one input file and proceeds to its children """
 281
 282         replace_characters(input_xml.getroot())
 283
 284         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 285
 286         # every input file will have a TOC entry,
 287         # pointing to starting chunk
 288         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 289         chars = set()
 290         if first:
 291             # write book title page
 292             html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
 293             chars = used_chars(html_tree.getroot())
 294             zip.writestr('OPS/title.html',
 295                  etree.tostring(html_tree, method="html", pretty_print=True))
 296         elif children:
 297             # write title page for every parent
 298             if sample is not None and sample <= 0:
 299                 chars = set()
 300                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 301             else:
 302                 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
 303                 chars = used_chars(html_tree.getroot())
 304                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 305             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 306             add_to_manifest(manifest, chunk_counter)
 307             add_to_spine(spine, chunk_counter)
 308             chunk_counter += 1
 309
 310         if len(input_xml.getroot()) > 1:
 311             # rdf before style master
 312             main_text = input_xml.getroot()[1]
 313         else:
 314             # rdf in style master
 315             main_text = input_xml.getroot()[0]
 316             if main_text.tag == RDFNS('RDF'):
 317                 main_text = None
 318
 319         if main_text is not None:
 320             for chunk_xml in chop(main_text):
 321                 empty = False
 322                 if sample is not None:
 323                     if sample <= 0:
 324                         empty = True
 325                     else:
 326                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 327                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 328
 329                 toc.extend(chunk_toc)
 330                 chars = chars.union(chunk_chars)
 331                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 332                 add_to_manifest(manifest, chunk_counter)
 333                 add_to_spine(spine, chunk_counter)
 334                 chunk_counter += 1
 335
 336         if children:
 337             for child in children:
 338                 child_xml = etree.parse(provider.by_uri(child))
 339                 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
 340                 toc.append(child_toc)
 341                 chars = chars.union(chunk_chars)
 342
 343         return toc, chunk_counter, chars, sample
 344
 345     # read metadata from the first file
 346     if file_path:
 347         if slug:
 348             raise ValueError('slug or file_path should be specified, not both')
 349         f = open(file_path, 'r')
 350         input_xml = etree.parse(f)
 351         f.close()
 352     else:
 353         if not slug:
 354             raise ValueError('either slug or file_path should be specified')
 355         input_xml = etree.parse(provider[slug])
 356
 357     if flags:
 358         for flag in flags:
 359             input_xml.getroot().set(flag, 'yes')
 360
 361     metadata = input_xml.find('.//'+RDFNS('Description'))
 362     if metadata is None:
 363         raise NoDublinCore('Document has no DublinCore - which is required.')
 364     book_info = BookInfo.from_element(input_xml)
 365     metadata = etree.ElementTree(metadata)
 366
 367     # if output to dir, create the file
 368     if output_dir is not None:
 369         if make_dir:
 370             author = unicode(book_info.author)
 371             output_dir = os.path.join(output_dir, author)
 372             try:
 373                 os.makedirs(output_dir)
 374             except OSError:
 375                 pass
 376         if slug:
 377             output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
 378         else:
 379             output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
 380
 381     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 382
 383     # write static elements
 384     mime = zipfile.ZipInfo()
 385     mime.filename = 'mimetype'
 386     mime.compress_type = zipfile.ZIP_STORED
 387     mime.extra = ''
 388     zip.writestr(mime, 'application/epub+zip')
 389     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 390                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 391                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 392                        'media-type="application/oebps-package+xml" />' \
 393                        '</rootfiles></container>')
 394     zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
 395     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 396
 397     opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
 398     manifest = opf.find('.//' + OPFNS('manifest'))
 399     spine = opf.find('.//' + OPFNS('spine'))
 400
 401     if cover:
 402         cover_file = StringIO()
 403         c = cover(book_info.author.readable(), book_info.title)
 404         c.save(cover_file)
 405         c_name = 'cover.%s' % c.ext()
 406         zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
 407         del cover_file
 408
 409         cover_tree = etree.parse(get_resource('epub/cover.html'))
 410         cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
 411         zip.writestr('OPS/cover.html', etree.tostring(
 412                         cover_tree, method="html", pretty_print=True))
 413
 414         manifest.append(etree.fromstring(
 415             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 416         manifest.append(etree.fromstring(
 417             '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
 418         spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
 419         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 420         opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))
 421
 422
 423     annotations = etree.Element('annotations')
 424
 425     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 426                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 427                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 428                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 429                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 430                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 431                                '</navPoint></navMap></ncx>')
 432     nav_map = toc_file[-1]
 433
 434     toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
 435
 436     if not toc.children:
 437         toc.add(u"Początek utworu", 1)
 438     toc_counter = toc.write_to_xml(nav_map, 2)
 439
 440     # Last modifications in container files and EPUB creation
 441     if len(annotations) > 0:
 442         nav_map.append(etree.fromstring(
 443             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 444             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 445         toc_counter += 1
 446         manifest.append(etree.fromstring(
 447             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 448         spine.append(etree.fromstring(
 449             '<itemref idref="annotations" />'))
 450         replace_by_verse(annotations)
 451         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 452         chars = chars.union(used_chars(html_tree.getroot()))
 453         zip.writestr('OPS/annotations.html', etree.tostring(
 454                             html_tree, method="html", pretty_print=True))
 455
 456     nav_map.append(etree.fromstring(
 457         '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
 458         '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
 459     manifest.append(etree.fromstring(
 460         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 461     spine.append(etree.fromstring(
 462         '<itemref idref="last" />'))
 463     html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
 464     chars.update(used_chars(html_tree.getroot()))
 465     zip.writestr('OPS/last.html', etree.tostring(
 466                         html_tree, method="html", pretty_print=True))
 467
 468     # strip fonts
 469     tmpdir = mkdtemp('-librarian-epub')
 470     cwd = os.getcwd()
 471
 472     os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 473     for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 474         optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 475                           get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 476         if verbose:
 477             print "Running font-optimizer"
 478             subprocess.check_call(optimizer_call)
 479         else:
 480             subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 481         zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 482     rmtree(tmpdir)
 483     os.chdir(cwd)
 484
 485     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 486     contents = []
 487     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 488     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 489     for st in attributes:
 490         meta = toc_file.makeelement(NCXNS('meta'))
 491         meta.set('name', st)
 492         meta.set('content', '0')
 493         toc_file[0].append(meta)
 494     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 495     toc_file[0][1].set('content', str(toc.depth()))
 496     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 497     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 498     zip.close()