librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from copy import deepcopy
  12 from lxml import etree
  13 import zipfile
  14 from tempfile import mkdtemp
  15 from shutil import rmtree
  16
  17 import sys
  18
  19 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
  20 from librarian.dcparser import BookInfo
  21
  22 from librarian import functions
  23
  24 functions.reg_person_name()
  25
  26
  27 def inner_xml(node):
  28     """ returns node's text and children as a string
  29
  30     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  31     x<b>y</b>z
  32     """
  33
  34     nt = node.text if node.text is not None else ''
  35     return ''.join([nt] + [etree.tostring(child) for child in node])
  36
  37 def set_inner_xml(node, text):
  38     """ sets node's text and children from a string
  39
  40     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  41     >>> set_inner_xml(e, 'x<b>y</b>z')
  42     >>> print etree.tostring(e)
  43     <a>x<b>y</b>z</a>
  44     """
  45
  46     p = etree.fromstring('<x>%s</x>' % text)
  47     node.text = p.text
  48     node[:] = p[:]
  49
  50
  51 def node_name(node):
  52     """ Find out a node's name
  53
  54     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  55     XYZ
  56     """
  57
  58     tempnode = deepcopy(node)
  59
  60     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  61         for e in tempnode.findall('.//%s' % p):
  62             t = e.tail
  63             e.clear()
  64             e.tail = t
  65     etree.strip_tags(tempnode, '*')
  66     return tempnode.text
  67
  68
  69 def xslt(xml, sheet):
  70     if isinstance(xml, etree._Element):
  71         xml = etree.ElementTree(xml)
  72     with open(sheet) as xsltf:
  73         return xml.xslt(etree.parse(xsltf))
  74
  75
  76 _resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
  77 def res(fname):
  78     return os.path.join(_resdir, fname)
  79
  80
  81 def replace_characters(node):
  82     def replace_chars(text):
  83         if text is None:
  84             return None
  85         return text.replace(u"\ufeff", u"")\
  86                    .replace("---", u"\u2014")\
  87                    .replace("--", u"\u2013")\
  88                    .replace(",,", u"\u201E")\
  89                    .replace('"', u"\u201D")\
  90                    .replace("'", u"\u2019")
  91     if node.tag == 'extra':
  92         node.clear()
  93     else:
  94         node.text = replace_chars(node.text)
  95         node.tail = replace_chars(node.tail)
  96         for child in node:
  97             replace_characters(child)
  98
  99
 100 def find_annotations(annotations, source, part_no):
 101     for child in source:
 102         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 103             annotation = deepcopy(child)
 104             number = str(len(annotations)+1)
 105             annotation.set('number', number)
 106             annotation.set('part', str(part_no))
 107             annotation.tail = ''
 108             annotations.append(annotation)
 109             tail = child.tail
 110             child.clear()
 111             child.tail = tail
 112             child.text = number
 113         if child.tag not in ('extra', 'podtytul'):
 114             find_annotations(annotations, child, part_no)
 115
 116
 117 def replace_by_verse(tree):
 118     """ Find stanzas and create new verses in place of a '/' character """
 119
 120     stanzas = tree.findall('.//' + WLNS('strofa'))
 121     for node in stanzas:
 122         for child_node in node:
 123             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 124                 foreign_verses = inner_xml(child_node).split('/\n')
 125                 if len(foreign_verses) > 1:
 126                     new_foreign = ''
 127                     for foreign_verse in foreign_verses:
 128                         if foreign_verse.startswith('<wers'):
 129                             new_foreign += foreign_verse
 130                         else:
 131                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 132                     set_inner_xml(child_node, new_foreign)
 133         verses = inner_xml(node).split('/\n')
 134         if len(verses) > 1:
 135             modified_inner_xml = ''
 136             for verse in verses:
 137                 if verse.startswith('<wers') or verse.startswith('<extra'):
 138                     modified_inner_xml += verse
 139                 else:
 140                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 141             set_inner_xml(node, modified_inner_xml)
 142
 143
 144 def add_to_manifest(manifest, partno):
 145     """ Adds a node to the manifest section in content.opf file """
 146
 147     partstr = 'part%d' % partno
 148     e = manifest.makeelement(OPFNS('item'), attrib={
 149                                  'id': partstr,
 150                                  'href': partstr + '.html',
 151                                  'media-type': 'application/xhtml+xml',
 152                              })
 153     manifest.append(e)
 154
 155
 156 def add_to_spine(spine, partno):
 157     """ Adds a node to the spine section in content.opf file """
 158
 159     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 160     spine.append(e)
 161
 162
 163 class TOC(object):
 164     def __init__(self, name=None, part_number=None):
 165         self.children = []
 166         self.name = name
 167         self.part_number = part_number
 168         self.sub_number = None
 169
 170     def add(self, name, part_number, level=0, is_part=True):
 171         if level > 0 and self.children:
 172             return self.children[-1].add(name, part_number, level-1, is_part)
 173         else:
 174             t = TOC(name)
 175             t.part_number = part_number
 176             self.children.append(t)
 177             if not is_part:
 178                 t.sub_number = len(self.children) + 1
 179                 return t.sub_number
 180
 181     def append(self, toc):
 182         self.children.append(toc)
 183
 184     def extend(self, toc):
 185         self.children.extend(toc.children)
 186
 187     def depth(self):
 188         if self.children:
 189             return max((c.depth() for c in self.children)) + 1
 190         else:
 191             return 0
 192
 193     def write_to_xml(self, nav_map, counter):
 194         for child in self.children:
 195             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 196             nav_point.set('id', 'NavPoint-%d' % counter)
 197             nav_point.set('playOrder', str(counter))
 198
 199             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 200             text = nav_map.makeelement(NCXNS('text'))
 201             text.text = child.name
 202             nav_label.append(text)
 203             nav_point.append(nav_label)
 204
 205             content = nav_map.makeelement(NCXNS('content'))
 206             src = 'part%d.html' % child.part_number
 207             if child.sub_number is not None:
 208                 src += '#sub%d' % child.sub_number
 209             content.set('src', src)
 210             nav_point.append(content)
 211             nav_map.append(nav_point)
 212             counter = child.write_to_xml(nav_point, counter + 1)
 213         return counter
 214
 215
 216 def used_chars(element):
 217     """ Lists characters used in an ETree Element """
 218     chars = set((element.text or '') + (element.tail or ''))
 219     for child in element:
 220         chars = chars.union(used_chars(child))
 221     return chars
 222
 223
 224 def chop(main_text):
 225     """ divide main content of the XML file into chunks """
 226
 227     # prepare a container for each chunk
 228     part_xml = etree.Element('utwor')
 229     etree.SubElement(part_xml, 'master')
 230     main_xml_part = part_xml[0] # master
 231
 232     last_node_part = False
 233     for one_part in main_text:
 234         name = one_part.tag
 235         if name == 'naglowek_czesc':
 236             yield part_xml
 237             last_node_part = True
 238             main_xml_part[:] = [deepcopy(one_part)]
 239         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 240             yield part_xml
 241             main_xml_part[:] = [deepcopy(one_part)]
 242         else:
 243             main_xml_part.append(deepcopy(one_part))
 244             last_node_part = False
 245     yield part_xml
 246
 247
 248 def transform_chunk(chunk_xml, chunk_no, annotations):
 249     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 250
 251     toc = TOC()
 252     for element in chunk_xml[0]:
 253         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 254             toc.add(node_name(element), chunk_no)
 255         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 256             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 257             element.set('sub', str(subnumber))
 258     find_annotations(annotations, chunk_xml, chunk_no)
 259     replace_by_verse(chunk_xml)
 260     html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
 261     chars = used_chars(html_tree.getroot())
 262     output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 263     return output_html, toc, chars
 264
 265
 266 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False):
 267     """ produces a EPUB file
 268
 269     provider: a DocProvider
 270     slug: slug of file to process, available by provider
 271     output_file: file-like object or path to output file
 272     output_dir: path to directory to save output file to; either this or output_file must be present
 273     make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
 274     """
 275
 276     def transform_file(input_xml, chunk_counter=1, first=True):
 277         """ processes one input file and proceeds to its children """
 278
 279         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 280
 281         # every input file will have a TOC entry,
 282         # pointing to starting chunk
 283         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 284         chars = set()
 285         if first:
 286             # write book title page
 287             html_tree = xslt(input_xml, res('xsltTitle.xsl'))
 288             chars = used_chars(html_tree.getroot())
 289             zip.writestr('OPS/title.html',
 290                  etree.tostring(html_tree, method="html", pretty_print=True))
 291         elif children:
 292             # write title page for every parent
 293             html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
 294             chars = used_chars(html_tree.getroot())
 295             zip.writestr('OPS/part%d.html' % chunk_counter,
 296                 etree.tostring(html_tree, method="html", pretty_print=True))
 297             add_to_manifest(manifest, chunk_counter)
 298             add_to_spine(spine, chunk_counter)
 299             chunk_counter += 1
 300
 301         if len(input_xml.getroot()) > 1:
 302             # rdf before style master
 303             main_text = input_xml.getroot()[1]
 304         else:
 305             # rdf in style master
 306             main_text = input_xml.getroot()[0]
 307             if main_text.tag == RDFNS('RDF'):
 308                 main_text = None
 309
 310         if main_text is not None:
 311             replace_characters(main_text)
 312
 313             for chunk_xml in chop(main_text):
 314                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
 315                 toc.extend(chunk_toc)
 316                 chars = chars.union(chunk_chars)
 317                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 318                 add_to_manifest(manifest, chunk_counter)
 319                 add_to_spine(spine, chunk_counter)
 320                 chunk_counter += 1
 321
 322         if children:
 323             for child in children:
 324                 child_xml = etree.parse(provider.by_uri(child))
 325                 child_toc, chunk_counter, chunk_chars = transform_file(child_xml, chunk_counter, first=False)
 326                 toc.append(child_toc)
 327                 chars = chars.union(chunk_chars)
 328
 329         return toc, chunk_counter, chars
 330
 331     # read metadata from the first file
 332     if file_path:
 333         if slug:
 334             raise ValueError('slug or file_path should be specified, not both')
 335         f = open(file_path, 'r')
 336         input_xml = etree.parse(f)
 337         f.close()
 338     else:
 339         if not slug:
 340             raise ValueError('either slug or file_path should be specified')
 341         input_xml = etree.parse(provider[slug])
 342
 343     metadata = input_xml.find('.//'+RDFNS('Description'))
 344     if metadata is None:
 345         raise NoDublinCore('Document has no DublinCore - which is required.')
 346     book_info = BookInfo.from_element(input_xml)
 347     metadata = etree.ElementTree(metadata)
 348
 349     # if output to dir, create the file
 350     if output_dir is not None:
 351         if make_dir:
 352             author = unicode(book_info.author)
 353             output_dir = os.path.join(output_dir, author)
 354             try:
 355                 os.makedirs(output_dir)
 356             except OSError:
 357                 pass
 358         if slug:
 359             output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
 360         else:
 361             output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
 362
 363     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 364
 365     # write static elements
 366     mime = zipfile.ZipInfo()
 367     mime.filename = 'mimetype'
 368     mime.compress_type = zipfile.ZIP_STORED
 369     mime.extra = ''
 370     zip.writestr(mime, 'application/epub+zip')
 371     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 372                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 373                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 374                        'media-type="application/oebps-package+xml" />' \
 375                        '</rootfiles></container>')
 376     for fname in 'style.css', 'logo_wolnelektury.png':
 377         zip.write(res(fname), os.path.join('OPS', fname))
 378
 379     opf = xslt(metadata, res('xsltContent.xsl'))
 380     manifest = opf.find('.//' + OPFNS('manifest'))
 381     spine = opf.find('.//' + OPFNS('spine'))
 382
 383     annotations = etree.Element('annotations')
 384
 385     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 386                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 387                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 388                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 389                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 390                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 391                                '</navPoint></navMap></ncx>')
 392     nav_map = toc_file[-1]
 393
 394     toc, chunk_counter, chars = transform_file(input_xml)
 395
 396     if not toc.children:
 397         toc.add(u"Początek utworu", 1)
 398     toc_counter = toc.write_to_xml(nav_map, 2)
 399
 400     # Last modifications in container files and EPUB creation
 401     if len(annotations) > 0:
 402         nav_map.append(etree.fromstring(
 403             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 404             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 405         manifest.append(etree.fromstring(
 406             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 407         spine.append(etree.fromstring(
 408             '<itemref idref="annotations" />'))
 409         replace_by_verse(annotations)
 410         html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
 411         chars = chars.union(used_chars(html_tree.getroot()))
 412         zip.writestr('OPS/annotations.html', etree.tostring(
 413                             html_tree, method="html", pretty_print=True))
 414
 415     # strip fonts
 416     tmpdir = mkdtemp('-librarian-epub')
 417     cwd = os.getcwd()
 418
 419     os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 420     for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 421         optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'), res('../fonts/' + fname), os.path.join(tmpdir, fname)]
 422         if verbose:
 423             print "Running font-optimizer"
 424             subprocess.check_call(optimizer_call)
 425         else:
 426             subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 427         zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 428     rmtree(tmpdir)
 429     os.chdir(cwd)
 430
 431     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 432     contents = []
 433     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 434     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 435     for st in attributes:
 436         meta = toc_file.makeelement(NCXNS('meta'))
 437         meta.set('name', st)
 438         meta.set('content', '0')
 439         toc_file[0].append(meta)
 440     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 441     toc_file[0][1].set('content', str(toc.depth()))
 442     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 443     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 444     zip.close()