librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from copy import deepcopy
  12 from lxml import etree
  13 import zipfile
  14 from tempfile import mkdtemp
  15 from shutil import rmtree
  16
  17 import sys
  18
  19 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
  20 from librarian.dcparser import BookInfo
  21
  22 from librarian import functions
  23
# Executed once, at import time, for its side effect.
# NOTE(review): presumably registers a person-name extension function used by
# the XSLT stylesheets applied in this module -- confirm in librarian.functions.
functions.reg_person_name()
  25
  26
  27 def inner_xml(node):
  28     """ returns node's text and children as a string
  29
  30     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  31     x<b>y</b>z
  32     """
  33
  34     nt = node.text if node.text is not None else ''
  35     return ''.join([nt] + [etree.tostring(child) for child in node])
  36
  37 def set_inner_xml(node, text):
  38     """ sets node's text and children from a string
  39
  40     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  41     >>> set_inner_xml(e, 'x<b>y</b>z')
  42     >>> print etree.tostring(e)
  43     <a>x<b>y</b>z</a>
  44     """
  45
  46     p = etree.fromstring('<x>%s</x>' % text)
  47     node.text = p.text
  48     node[:] = p[:]
  49
  50
  51 def node_name(node):
  52     """ Find out a node's name
  53
  54     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  55     XYZ
  56     """
  57
  58     tempnode = deepcopy(node)
  59
  60     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  61         for e in tempnode.findall('.//%s' % p):
  62             t = e.tail
  63             e.clear()
  64             e.tail = t
  65     etree.strip_tags(tempnode, '*')
  66     return tempnode.text
  67
  68
  69 def xslt(xml, sheet):
  70     if isinstance(xml, etree._Element):
  71         xml = etree.ElementTree(xml)
  72     with open(sheet) as xsltf:
  73         return xml.xslt(etree.parse(xsltf))
  74
  75
  76 _resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
  77 def res(fname):
  78     return os.path.join(_resdir, fname)
  79
  80
  81 def replace_characters(node):
  82     def replace_chars(text):
  83         if text is None:
  84             return None
  85         return text.replace("---", u"\u2014")\
  86                    .replace("--", u"\u2013")\
  87                    .replace(",,", u"\u201E")\
  88                    .replace('"', u"\u201D")\
  89                    .replace("'", u"\u2019")
  90     if node.tag == 'extra':
  91         node.clear()
  92     else:
  93         node.text = replace_chars(node.text)
  94         node.tail = replace_chars(node.tail)
  95         for child in node:
  96             replace_characters(child)
  97
  98
  99 def find_annotations(annotations, source, part_no):
 100     for child in source:
 101         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 102             annotation = deepcopy(child)
 103             number = str(len(annotations)+1)
 104             annotation.set('number', number)
 105             annotation.set('part', str(part_no))
 106             annotation.tail = ''
 107             annotations.append(annotation)
 108             tail = child.tail
 109             child.clear()
 110             child.tail = tail
 111             child.text = number
 112         if child.tag not in ('extra', 'podtytul'):
 113             find_annotations(annotations, child, part_no)
 114
 115
 116 def replace_by_verse(tree):
 117     """ Find stanzas and create new verses in place of a '/' character """
 118
 119     stanzas = tree.findall('.//' + WLNS('strofa'))
 120     for node in stanzas:
 121         for child_node in node:
 122             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 123                 foreign_verses = inner_xml(child_node).split('/\n')
 124                 if len(foreign_verses) > 1:
 125                     new_foreign = ''
 126                     for foreign_verse in foreign_verses:
 127                         if foreign_verse.startswith('<wers'):
 128                             new_foreign += foreign_verse
 129                         else:
 130                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 131                     set_inner_xml(child_node, new_foreign)
 132         verses = inner_xml(node).split('/\n')
 133         if len(verses) > 1:
 134             modified_inner_xml = ''
 135             for verse in verses:
 136                 if verse.startswith('<wers') or verse.startswith('<extra'):
 137                     modified_inner_xml += verse
 138                 else:
 139                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 140             set_inner_xml(node, modified_inner_xml)
 141
 142
 143 def add_to_manifest(manifest, partno):
 144     """ Adds a node to the manifest section in content.opf file """
 145
 146     partstr = 'part%d' % partno
 147     e = manifest.makeelement(OPFNS('item'), attrib={
 148                                  'id': partstr,
 149                                  'href': partstr + '.html',
 150                                  'media-type': 'application/xhtml+xml',
 151                              })
 152     manifest.append(e)
 153
 154
 155 def add_to_spine(spine, partno):
 156     """ Adds a node to the spine section in content.opf file """
 157
 158     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 159     spine.append(e)
 160
 161
 162 class TOC(object):
 163     def __init__(self, name=None, part_number=None):
 164         self.children = []
 165         self.name = name
 166         self.part_number = part_number
 167         self.sub_number = None
 168
 169     def add(self, name, part_number, level=0, is_part=True):
 170         if level > 0 and self.children:
 171             return self.children[-1].add(name, part_number, level-1, is_part)
 172         else:
 173             t = TOC(name)
 174             t.part_number = part_number
 175             self.children.append(t)
 176             if not is_part:
 177                 t.sub_number = len(self.children) + 1
 178                 return t.sub_number
 179
 180     def append(self, toc):
 181         self.children.append(toc)
 182
 183     def extend(self, toc):
 184         self.children.extend(toc.children)
 185
 186     def depth(self):
 187         if self.children:
 188             return max((c.depth() for c in self.children)) + 1
 189         else:
 190             return 0
 191
 192     def write_to_xml(self, nav_map, counter):
 193         for child in self.children:
 194             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 195             nav_point.set('id', 'NavPoint-%d' % counter)
 196             nav_point.set('playOrder', str(counter))
 197
 198             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 199             text = nav_map.makeelement(NCXNS('text'))
 200             text.text = child.name
 201             nav_label.append(text)
 202             nav_point.append(nav_label)
 203
 204             content = nav_map.makeelement(NCXNS('content'))
 205             src = 'part%d.html' % child.part_number
 206             if child.sub_number is not None:
 207                 src += '#sub%d' % child.sub_number
 208             content.set('src', src)
 209             nav_point.append(content)
 210             nav_map.append(nav_point)
 211             counter = child.write_to_xml(nav_point, counter + 1)
 212         return counter
 213
 214
 215 def used_chars(element):
 216     """ Lists characters used in an ETree Element """
 217     chars = set((element.text or '') + (element.tail or ''))
 218     for child in element:
 219         chars = chars.union(used_chars(child))
 220     return chars
 221
 222
 223 def chop(main_text):
 224     """ divide main content of the XML file into chunks """
 225
 226     # prepare a container for each chunk
 227     part_xml = etree.Element('utwor')
 228     etree.SubElement(part_xml, 'master')
 229     main_xml_part = part_xml[0] # master
 230
 231     last_node_part = False
 232     for one_part in main_text:
 233         name = one_part.tag
 234         if name == 'naglowek_czesc':
 235             yield part_xml
 236             last_node_part = True
 237             main_xml_part[:] = [deepcopy(one_part)]
 238         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 239             yield part_xml
 240             main_xml_part[:] = [deepcopy(one_part)]
 241         else:
 242             main_xml_part.append(deepcopy(one_part))
 243             last_node_part = False
 244     yield part_xml
 245
 246
 247 def transform_chunk(chunk_xml, chunk_no, annotations):
 248     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 249
 250     toc = TOC()
 251     for element in chunk_xml[0]:
 252         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 253             toc.add(node_name(element), chunk_no)
 254         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 255             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 256             element.set('sub', str(subnumber))
 257     find_annotations(annotations, chunk_xml, chunk_no)
 258     replace_by_verse(chunk_xml)
 259     html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
 260     chars = used_chars(html_tree.getroot())
 261     output_html = etree.tostring(html_tree, pretty_print=True)
 262     return output_html, toc, chars
 263
 264
 265 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False):
 266     """ produces a EPUB file
 267
 268     provider: a DocProvider
 269     slug: slug of file to process, available by provider
 270     output_file: file-like object or path to output file
 271     output_dir: path to directory to save output file to; either this or output_file must be present
 272     make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
 273     """
 274
 275     def transform_file(input_xml, chunk_counter=1, first=True):
 276         """ processes one input file and proceeds to its children """
 277
 278         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 279
 280         # every input file will have a TOC entry,
 281         # pointing to starting chunk
 282         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 283         chars = set()
 284         if first:
 285             # write book title page
 286             html_tree = xslt(input_xml, res('xsltTitle.xsl'))
 287             chars = used_chars(html_tree.getroot())
 288             zip.writestr('OPS/title.html',
 289                  etree.tostring(html_tree, pretty_print=True))
 290         elif children:
 291             # write title page for every parent
 292             html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
 293             chars = used_chars(html_tree.getroot())
 294             zip.writestr('OPS/part%d.html' % chunk_counter,
 295                 etree.tostring(html_tree, pretty_print=True))
 296             add_to_manifest(manifest, chunk_counter)
 297             add_to_spine(spine, chunk_counter)
 298             chunk_counter += 1
 299
 300         if len(input_xml.getroot()) > 1:
 301             # rdf before style master
 302             main_text = input_xml.getroot()[1]
 303         else:
 304             # rdf in style master
 305             main_text = input_xml.getroot()[0]
 306             if main_text.tag == RDFNS('RDF'):
 307                 main_text = None
 308
 309         if main_text is not None:
 310             replace_characters(main_text)
 311
 312             for chunk_xml in chop(main_text):
 313                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
 314                 toc.extend(chunk_toc)
 315                 chars = chars.union(chunk_chars)
 316                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 317                 add_to_manifest(manifest, chunk_counter)
 318                 add_to_spine(spine, chunk_counter)
 319                 chunk_counter += 1
 320
 321         if children:
 322             for child in children:
 323                 child_xml = etree.parse(provider.by_uri(child))
 324                 child_toc, chunk_counter, chunk_chars = transform_file(child_xml, chunk_counter, first=False)
 325                 toc.append(child_toc)
 326                 chars = chars.union(chunk_chars)
 327
 328         return toc, chunk_counter, chars
 329
 330     # read metadata from the first file
 331     if file_path:
 332         if slug:
 333             raise ValueError('slug or file_path should be specified, not both')
 334         f = open(file_path, 'r')
 335         input_xml = etree.parse(f)
 336         f.close()
 337     else:
 338         if not slug:
 339             raise ValueError('either slug or file_path should be specified')
 340         input_xml = etree.parse(provider[slug])
 341
 342     metadata = input_xml.find('.//'+RDFNS('Description'))
 343     if metadata is None:
 344         raise NoDublinCore('Document has no DublinCore - which is required.')
 345     book_info = BookInfo.from_element(input_xml)
 346     metadata = etree.ElementTree(metadata)
 347
 348     # if output to dir, create the file
 349     if output_dir is not None:
 350         if make_dir:
 351             author = unicode(book_info.author)
 352             output_dir = os.path.join(output_dir, author)
 353             try:
 354                 os.makedirs(output_dir)
 355             except OSError:
 356                 pass
 357         if slug:
 358             output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
 359         else:
 360             output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
 361
 362     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 363
 364     # write static elements
 365     mime = zipfile.ZipInfo()
 366     mime.filename = 'mimetype'
 367     mime.compress_type = zipfile.ZIP_STORED
 368     mime.extra = ''
 369     zip.writestr(mime, 'application/epub+zip')
 370     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 371                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 372                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 373                        'media-type="application/oebps-package+xml" />' \
 374                        '</rootfiles></container>')
 375     for fname in 'style.css', 'logo_wolnelektury.png':
 376         zip.write(res(fname), os.path.join('OPS', fname))
 377
 378     opf = xslt(metadata, res('xsltContent.xsl'))
 379     manifest = opf.find('.//' + OPFNS('manifest'))
 380     spine = opf.find('.//' + OPFNS('spine'))
 381
 382     annotations = etree.Element('annotations')
 383
 384     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 385                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 386                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 387                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 388                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 389                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 390                                '</navPoint></navMap></ncx>')
 391     nav_map = toc_file[-1]
 392
 393     toc, chunk_counter, chars = transform_file(input_xml)
 394
 395     if not toc.children:
 396         toc.add(u"Początek utworu", 1)
 397     toc_counter = toc.write_to_xml(nav_map, 2)
 398
 399     # Last modifications in container files and EPUB creation
 400     if len(annotations) > 0:
 401         nav_map.append(etree.fromstring(
 402             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 403             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 404         manifest.append(etree.fromstring(
 405             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 406         spine.append(etree.fromstring(
 407             '<itemref idref="annotations" />'))
 408         replace_by_verse(annotations)
 409         html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
 410         chars = chars.union(used_chars(html_tree.getroot()))
 411         zip.writestr('OPS/annotations.html', etree.tostring(
 412                             html_tree, pretty_print=True))
 413
 414     # strip fonts
 415     tmpdir = mkdtemp('-librarian-epub')
 416     cwd = os.getcwd()
 417
 418     os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 419     for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 420         optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'), res('../fonts/' + fname), os.path.join(tmpdir, fname)]
 421         if verbose:
 422             print "Running font-optimizer"
 423             subprocess.check_call(optimizer_call)
 424         else:
 425             subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 426         zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 427     rmtree(tmpdir)
 428     os.chdir(cwd)
 429
 430     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 431     contents = []
 432     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 433     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 434     for st in attributes:
 435         meta = toc_file.makeelement(NCXNS('meta'))
 436         meta.set('name', st)
 437         meta.set('content', '0')
 438         toc_file[0].append(meta)
 439     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 440     toc_file[0][1].set('content', str(toc.depth()))
 441     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 442     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 443     zip.close()