librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from copy import deepcopy
  12 from lxml import etree
  13 import zipfile
  14 from tempfile import mkdtemp
  15 from shutil import rmtree
  16
  17 import sys
  18
  19 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
  20 from librarian.dcparser import BookInfo
  21
  22 from librarian import functions, get_resource
  23
# Import-time side effect: invokes librarian.functions.reg_person_name().
# NOTE(review): presumably this registers a person-name helper that the XSLT
# stylesheets applied below rely on -- confirm against librarian.functions.
functions.reg_person_name()
  25
  26
  27 def inner_xml(node):
  28     """ returns node's text and children as a string
  29
  30     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  31     x<b>y</b>z
  32     """
  33
  34     nt = node.text if node.text is not None else ''
  35     return ''.join([nt] + [etree.tostring(child) for child in node])
  36
  37 def set_inner_xml(node, text):
  38     """ sets node's text and children from a string
  39
  40     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  41     >>> set_inner_xml(e, 'x<b>y</b>z')
  42     >>> print etree.tostring(e)
  43     <a>x<b>y</b>z</a>
  44     """
  45
  46     p = etree.fromstring('<x>%s</x>' % text)
  47     node.text = p.text
  48     node[:] = p[:]
  49
  50
  51 def node_name(node):
  52     """ Find out a node's name
  53
  54     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  55     XYZ
  56     """
  57
  58     tempnode = deepcopy(node)
  59
  60     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  61         for e in tempnode.findall('.//%s' % p):
  62             t = e.tail
  63             e.clear()
  64             e.tail = t
  65     etree.strip_tags(tempnode, '*')
  66     return tempnode.text
  67
  68
  69 def xslt(xml, sheet):
  70     if isinstance(xml, etree._Element):
  71         xml = etree.ElementTree(xml)
  72     with open(sheet) as xsltf:
  73         return xml.xslt(etree.parse(xsltf))
  74
  75
  76 def replace_characters(node):
  77     def replace_chars(text):
  78         if text is None:
  79             return None
  80         return text.replace(u"\ufeff", u"")\
  81                    .replace("---", u"\u2014")\
  82                    .replace("--", u"\u2013")\
  83                    .replace(",,", u"\u201E")\
  84                    .replace('"', u"\u201D")\
  85                    .replace("'", u"\u2019")
  86     if node.tag == 'extra':
  87         node.clear()
  88     else:
  89         node.text = replace_chars(node.text)
  90         node.tail = replace_chars(node.tail)
  91         for child in node:
  92             replace_characters(child)
  93
  94
  95 def find_annotations(annotations, source, part_no):
  96     for child in source:
  97         if child.tag in ('pe', 'pa', 'pt', 'pr'):
  98             annotation = deepcopy(child)
  99             number = str(len(annotations)+1)
 100             annotation.set('number', number)
 101             annotation.set('part', str(part_no))
 102             annotation.tail = ''
 103             annotations.append(annotation)
 104             tail = child.tail
 105             child.clear()
 106             child.tail = tail
 107             child.text = number
 108         if child.tag not in ('extra', 'podtytul'):
 109             find_annotations(annotations, child, part_no)
 110
 111
 112 def replace_by_verse(tree):
 113     """ Find stanzas and create new verses in place of a '/' character """
 114
 115     stanzas = tree.findall('.//' + WLNS('strofa'))
 116     for node in stanzas:
 117         for child_node in node:
 118             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 119                 foreign_verses = inner_xml(child_node).split('/\n')
 120                 if len(foreign_verses) > 1:
 121                     new_foreign = ''
 122                     for foreign_verse in foreign_verses:
 123                         if foreign_verse.startswith('<wers'):
 124                             new_foreign += foreign_verse
 125                         else:
 126                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 127                     set_inner_xml(child_node, new_foreign)
 128         verses = inner_xml(node).split('/\n')
 129         if len(verses) > 1:
 130             modified_inner_xml = ''
 131             for verse in verses:
 132                 if verse.startswith('<wers') or verse.startswith('<extra'):
 133                     modified_inner_xml += verse
 134                 else:
 135                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 136             set_inner_xml(node, modified_inner_xml)
 137
 138
 139 def add_to_manifest(manifest, partno):
 140     """ Adds a node to the manifest section in content.opf file """
 141
 142     partstr = 'part%d' % partno
 143     e = manifest.makeelement(OPFNS('item'), attrib={
 144                                  'id': partstr,
 145                                  'href': partstr + '.html',
 146                                  'media-type': 'application/xhtml+xml',
 147                              })
 148     manifest.append(e)
 149
 150
 151 def add_to_spine(spine, partno):
 152     """ Adds a node to the spine section in content.opf file """
 153
 154     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 155     spine.append(e)
 156
 157
 158 class TOC(object):
 159     def __init__(self, name=None, part_number=None):
 160         self.children = []
 161         self.name = name
 162         self.part_number = part_number
 163         self.sub_number = None
 164
 165     def add(self, name, part_number, level=0, is_part=True):
 166         if level > 0 and self.children:
 167             return self.children[-1].add(name, part_number, level-1, is_part)
 168         else:
 169             t = TOC(name)
 170             t.part_number = part_number
 171             self.children.append(t)
 172             if not is_part:
 173                 t.sub_number = len(self.children) + 1
 174                 return t.sub_number
 175
 176     def append(self, toc):
 177         self.children.append(toc)
 178
 179     def extend(self, toc):
 180         self.children.extend(toc.children)
 181
 182     def depth(self):
 183         if self.children:
 184             return max((c.depth() for c in self.children)) + 1
 185         else:
 186             return 0
 187
 188     def write_to_xml(self, nav_map, counter):
 189         for child in self.children:
 190             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 191             nav_point.set('id', 'NavPoint-%d' % counter)
 192             nav_point.set('playOrder', str(counter))
 193
 194             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 195             text = nav_map.makeelement(NCXNS('text'))
 196             text.text = child.name
 197             nav_label.append(text)
 198             nav_point.append(nav_label)
 199
 200             content = nav_map.makeelement(NCXNS('content'))
 201             src = 'part%d.html' % child.part_number
 202             if child.sub_number is not None:
 203                 src += '#sub%d' % child.sub_number
 204             content.set('src', src)
 205             nav_point.append(content)
 206             nav_map.append(nav_point)
 207             counter = child.write_to_xml(nav_point, counter + 1)
 208         return counter
 209
 210
 211 def used_chars(element):
 212     """ Lists characters used in an ETree Element """
 213     chars = set((element.text or '') + (element.tail or ''))
 214     for child in element:
 215         chars = chars.union(used_chars(child))
 216     return chars
 217
 218
 219 def chop(main_text):
 220     """ divide main content of the XML file into chunks """
 221
 222     # prepare a container for each chunk
 223     part_xml = etree.Element('utwor')
 224     etree.SubElement(part_xml, 'master')
 225     main_xml_part = part_xml[0] # master
 226
 227     last_node_part = False
 228     for one_part in main_text:
 229         name = one_part.tag
 230         if name == 'naglowek_czesc':
 231             yield part_xml
 232             last_node_part = True
 233             main_xml_part[:] = [deepcopy(one_part)]
 234         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 235             yield part_xml
 236             main_xml_part[:] = [deepcopy(one_part)]
 237         else:
 238             main_xml_part.append(deepcopy(one_part))
 239             last_node_part = False
 240     yield part_xml
 241
 242
 243 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 244     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 245
 246     toc = TOC()
 247     for element in chunk_xml[0]:
 248         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 249             toc.add(node_name(element), chunk_no)
 250         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 251             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 252             element.set('sub', str(subnumber))
 253     if empty:
 254         if not _empty_html_static:
 255             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 256         chars = set()
 257         output_html = _empty_html_static[0]
 258     else:
 259         find_annotations(annotations, chunk_xml, chunk_no)
 260         replace_by_verse(chunk_xml)
 261         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 262         chars = used_chars(html_tree.getroot())
 263         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 264     return output_html, toc, chars
 265
 266
 267 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, sample=None):
 268     """ produces a EPUB file
 269
 270     provider: a DocProvider
 271     slug: slug of file to process, available by provider
 272     output_file: file-like object or path to output file
 273     output_dir: path to directory to save output file to; either this or output_file must be present
 274     make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
 275     sample=n: generate sample e-book (with at least n paragraphs)
 276     """
 277
 278     def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
 279         """ processes one input file and proceeds to its children """
 280
 281         replace_characters(input_xml.getroot())
 282
 283         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 284
 285         # every input file will have a TOC entry,
 286         # pointing to starting chunk
 287         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 288         chars = set()
 289         if first:
 290             # write book title page
 291             html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
 292             chars = used_chars(html_tree.getroot())
 293             zip.writestr('OPS/title.html',
 294                  etree.tostring(html_tree, method="html", pretty_print=True))
 295         elif children:
 296             # write title page for every parent
 297             if sample is not None and sample <= 0:
 298                 chars = set()
 299                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 300             else:
 301                 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
 302                 chars = used_chars(html_tree.getroot())
 303                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 304             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 305             add_to_manifest(manifest, chunk_counter)
 306             add_to_spine(spine, chunk_counter)
 307             chunk_counter += 1
 308
 309         if len(input_xml.getroot()) > 1:
 310             # rdf before style master
 311             main_text = input_xml.getroot()[1]
 312         else:
 313             # rdf in style master
 314             main_text = input_xml.getroot()[0]
 315             if main_text.tag == RDFNS('RDF'):
 316                 main_text = None
 317
 318         if main_text is not None:
 319             for chunk_xml in chop(main_text):
 320                 empty = False
 321                 if sample is not None:
 322                     if sample <= 0:
 323                         empty = True
 324                     else:
 325                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 326                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 327
 328                 toc.extend(chunk_toc)
 329                 chars = chars.union(chunk_chars)
 330                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 331                 add_to_manifest(manifest, chunk_counter)
 332                 add_to_spine(spine, chunk_counter)
 333                 chunk_counter += 1
 334
 335         if children:
 336             for child in children:
 337                 child_xml = etree.parse(provider.by_uri(child))
 338                 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
 339                 toc.append(child_toc)
 340                 chars = chars.union(chunk_chars)
 341
 342         return toc, chunk_counter, chars, sample
 343
 344     # read metadata from the first file
 345     if file_path:
 346         if slug:
 347             raise ValueError('slug or file_path should be specified, not both')
 348         f = open(file_path, 'r')
 349         input_xml = etree.parse(f)
 350         f.close()
 351     else:
 352         if not slug:
 353             raise ValueError('either slug or file_path should be specified')
 354         input_xml = etree.parse(provider[slug])
 355
 356     metadata = input_xml.find('.//'+RDFNS('Description'))
 357     if metadata is None:
 358         raise NoDublinCore('Document has no DublinCore - which is required.')
 359     book_info = BookInfo.from_element(input_xml)
 360     metadata = etree.ElementTree(metadata)
 361
 362     # if output to dir, create the file
 363     if output_dir is not None:
 364         if make_dir:
 365             author = unicode(book_info.author)
 366             output_dir = os.path.join(output_dir, author)
 367             try:
 368                 os.makedirs(output_dir)
 369             except OSError:
 370                 pass
 371         if slug:
 372             output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
 373         else:
 374             output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
 375
 376     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 377
 378     # write static elements
 379     mime = zipfile.ZipInfo()
 380     mime.filename = 'mimetype'
 381     mime.compress_type = zipfile.ZIP_STORED
 382     mime.extra = ''
 383     zip.writestr(mime, 'application/epub+zip')
 384     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 385                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 386                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 387                        'media-type="application/oebps-package+xml" />' \
 388                        '</rootfiles></container>')
 389     zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
 390     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 391
 392     opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
 393     manifest = opf.find('.//' + OPFNS('manifest'))
 394     spine = opf.find('.//' + OPFNS('spine'))
 395
 396     annotations = etree.Element('annotations')
 397
 398     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 399                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 400                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 401                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 402                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 403                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 404                                '</navPoint></navMap></ncx>')
 405     nav_map = toc_file[-1]
 406
 407     toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
 408
 409     if not toc.children:
 410         toc.add(u"Początek utworu", 1)
 411     toc_counter = toc.write_to_xml(nav_map, 2)
 412
 413     # Last modifications in container files and EPUB creation
 414     if len(annotations) > 0:
 415         nav_map.append(etree.fromstring(
 416             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 417             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 418         manifest.append(etree.fromstring(
 419             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 420         spine.append(etree.fromstring(
 421             '<itemref idref="annotations" />'))
 422         replace_by_verse(annotations)
 423         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 424         chars = chars.union(used_chars(html_tree.getroot()))
 425         zip.writestr('OPS/annotations.html', etree.tostring(
 426                             html_tree, method="html", pretty_print=True))
 427
 428     # strip fonts
 429     tmpdir = mkdtemp('-librarian-epub')
 430     cwd = os.getcwd()
 431
 432     os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 433     for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 434         optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 435                           get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 436         if verbose:
 437             print "Running font-optimizer"
 438             subprocess.check_call(optimizer_call)
 439         else:
 440             subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 441         zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 442     rmtree(tmpdir)
 443     os.chdir(cwd)
 444
 445     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 446     contents = []
 447     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 448     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 449     for st in attributes:
 450         meta = toc_file.makeelement(NCXNS('meta'))
 451         meta.set('name', st)
 452         meta.set('content', '0')
 453         toc_file[0].append(meta)
 454     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 455     toc_file[0][1].set('content', str(toc.depth()))
 456     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 457     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 458     zip.close()