librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from copy import deepcopy
  12 from lxml import etree
  13 import zipfile
  14 from tempfile import mkdtemp
  15 from shutil import rmtree
  16
  17 import sys
  18
  19 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
  20 from librarian.dcparser import BookInfo
  21
  22 from librarian import functions
  23
  24 functions.reg_person_name()
  25
  26
  27 def inner_xml(node):
  28     """ returns node's text and children as a string
  29
  30     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  31     x<b>y</b>z
  32     """
  33
  34     nt = node.text if node.text is not None else ''
  35     return ''.join([nt] + [etree.tostring(child) for child in node])
  36
  37 def set_inner_xml(node, text):
  38     """ sets node's text and children from a string
  39
  40     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  41     >>> set_inner_xml(e, 'x<b>y</b>z')
  42     >>> print etree.tostring(e)
  43     <a>x<b>y</b>z</a>
  44     """
  45
  46     p = etree.fromstring('<x>%s</x>' % text)
  47     node.text = p.text
  48     node[:] = p[:]
  49
  50
  51 def node_name(node):
  52     """ Find out a node's name
  53
  54     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  55     XYZ
  56     """
  57
  58     tempnode = deepcopy(node)
  59
  60     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  61         for e in tempnode.findall('.//%s' % p):
  62             t = e.tail
  63             e.clear()
  64             e.tail = t
  65     etree.strip_tags(tempnode, '*')
  66     return tempnode.text
  67
  68
  69 def xslt(xml, sheet):
  70     if isinstance(xml, etree._Element):
  71         xml = etree.ElementTree(xml)
  72     with open(sheet) as xsltf:
  73         return xml.xslt(etree.parse(xsltf))
  74
  75
  76 _resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
  77 def res(fname):
  78     return os.path.join(_resdir, fname)
  79
  80
  81 def replace_characters(node):
  82     def replace_chars(text):
  83         if text is None:
  84             return None
  85         return text.replace("---", u"\u2014")\
  86                    .replace("--", u"\u2013")\
  87                    .replace(",,", u"\u201E")\
  88                    .replace('"', u"\u201D")\
  89                    .replace("'", u"\u2019")
  90     if node.tag == 'extra':
  91         node.clear()
  92     else:
  93         node.text = replace_chars(node.text)
  94         node.tail = replace_chars(node.tail)
  95         for child in node:
  96             replace_characters(child)
  97
  98
  99 def find_annotations(annotations, source, part_no):
 100     for child in source:
 101         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 102             annotation = deepcopy(child)
 103             number = str(len(annotations)+1)
 104             annotation.set('number', number)
 105             annotation.set('part', str(part_no))
 106             annotation.tail = ''
 107             annotations.append(annotation)
 108             tail = child.tail
 109             child.clear()
 110             child.tail = tail
 111             child.text = number
 112         if child.tag not in ('extra', 'podtytul'):
 113             find_annotations(annotations, child, part_no)
 114
 115
 116 def replace_by_verse(tree):
 117     """ Find stanzas and create new verses in place of a '/' character """
 118
 119     stanzas = tree.findall('.//' + WLNS('strofa'))
 120     for node in stanzas:
 121         for child_node in node:
 122             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 123                 foreign_verses = inner_xml(child_node).split('/\n')
 124                 if len(foreign_verses) > 1:
 125                     new_foreign = ''
 126                     for foreign_verse in foreign_verses:
 127                         if foreign_verse.startswith('<wers'):
 128                             new_foreign += foreign_verse
 129                         else:
 130                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 131                     set_inner_xml(child_node, new_foreign)
 132         verses = inner_xml(node).split('/\n')
 133         if len(verses) > 1:
 134             modified_inner_xml = ''
 135             for verse in verses:
 136                 if verse.startswith('<wers') or verse.startswith('<extra'):
 137                     modified_inner_xml += verse
 138                 else:
 139                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 140             set_inner_xml(node, modified_inner_xml)
 141
 142
 143 def add_to_manifest(manifest, partno):
 144     """ Adds a node to the manifest section in content.opf file """
 145
 146     partstr = 'part%d' % partno
 147     e = manifest.makeelement(OPFNS('item'), attrib={
 148                                  'id': partstr,
 149                                  'href': partstr + '.html',
 150                                  'media-type': 'application/xhtml+xml',
 151                              })
 152     manifest.append(e)
 153
 154
 155 def add_to_spine(spine, partno):
 156     """ Adds a node to the spine section in content.opf file """
 157
 158     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 159     spine.append(e)
 160
 161
 162 class TOC(object):
 163     def __init__(self, name=None, part_number=None):
 164         self.children = []
 165         self.name = name
 166         self.part_number = part_number
 167         self.sub_number = None
 168
 169     def add(self, name, part_number, level=0, is_part=True):
 170         if level > 0 and self.children:
 171             return self.children[-1].add(name, part_number, level-1, is_part)
 172         else:
 173             t = TOC(name)
 174             t.part_number = part_number
 175             self.children.append(t)
 176             if not is_part:
 177                 t.sub_number = len(self.children) + 1
 178                 return t.sub_number
 179
 180     def append(self, toc):
 181         self.children.append(toc)
 182
 183     def extend(self, toc):
 184         self.children.extend(toc.children)
 185
 186     def depth(self):
 187         if self.children:
 188             return max((c.depth() for c in self.children)) + 1
 189         else:
 190             return 0
 191
 192     def write_to_xml(self, nav_map, counter):
 193         for child in self.children:
 194             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 195             nav_point.set('id', 'NavPoint-%d' % counter)
 196             nav_point.set('playOrder', str(counter))
 197
 198             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 199             text = nav_map.makeelement(NCXNS('text'))
 200             text.text = child.name
 201             nav_label.append(text)
 202             nav_point.append(nav_label)
 203
 204             content = nav_map.makeelement(NCXNS('content'))
 205             src = 'part%d.html' % child.part_number
 206             if child.sub_number is not None:
 207                 src += '#sub%d' % child.sub_number
 208             content.set('src', src)
 209             nav_point.append(content)
 210             nav_map.append(nav_point)
 211             counter = child.write_to_xml(nav_point, counter + 1)
 212         return counter
 213
 214
 215 def used_chars(element):
 216     """ Lists characters used in an ETree Element """
 217     chars = set((element.text or '') + (element.tail or ''))
 218     for child in element:
 219         chars = chars.union(used_chars(child))
 220     return chars
 221
 222
 223 def chop(main_text):
 224     """ divide main content of the XML file into chunks """
 225
 226     # prepare a container for each chunk
 227     part_xml = etree.Element('utwor')
 228     etree.SubElement(part_xml, 'master')
 229     main_xml_part = part_xml[0] # master
 230
 231     last_node_part = False
 232     for one_part in main_text:
 233         name = one_part.tag
 234         if name == 'naglowek_czesc':
 235             yield part_xml
 236             last_node_part = True
 237             main_xml_part[:] = [deepcopy(one_part)]
 238         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 239             yield part_xml
 240             main_xml_part[:] = [deepcopy(one_part)]
 241         else:
 242             main_xml_part.append(deepcopy(one_part))
 243             last_node_part = False
 244     yield part_xml
 245
 246
 247 def transform_chunk(chunk_xml, chunk_no, annotations):
 248     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 249
 250     toc = TOC()
 251     for element in chunk_xml[0]:
 252         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 253             toc.add(node_name(element), chunk_no)
 254         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 255             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 256             element.set('sub', str(subnumber))
 257     find_annotations(annotations, chunk_xml, chunk_no)
 258     replace_by_verse(chunk_xml)
 259     html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
 260     chars = used_chars(html_tree.getroot())
 261     output_html = etree.tostring(html_tree, pretty_print=True)
 262     return output_html, toc, chars
 263
 264
 265 def transform(provider, slug, output_file=None, output_dir=None, make_dir=False):
 266     """ produces a EPUB file
 267
 268     provider: a DocProvider
 269     slug: slug of file to process, available by provider
 270     output_file: file-like object or path to output file
 271     output_dir: path to directory to save output file to; either this or output_file must be present
 272     make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
 273     """
 274
 275     def transform_file(input_xml, chunk_counter=1, first=True):
 276         """ processes one input file and proceeds to its children """
 277
 278         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 279
 280         # every input file will have a TOC entry,
 281         # pointing to starting chunk
 282         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 283         chars = set()
 284         if first:
 285             # write book title page
 286             html_tree = xslt(input_xml, res('xsltTitle.xsl'))
 287             chars = used_chars(html_tree.getroot())
 288             zip.writestr('OPS/title.html',
 289                  etree.tostring(html_tree, pretty_print=True))
 290         elif children:
 291             # write title page for every parent
 292             html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
 293             chars = used_chars(html_tree.getroot())
 294             zip.writestr('OPS/part%d.html' % chunk_counter,
 295                 etree.tostring(html_tree, pretty_print=True))
 296             add_to_manifest(manifest, chunk_counter)
 297             add_to_spine(spine, chunk_counter)
 298             chunk_counter += 1
 299
 300         if len(input_xml.getroot()) > 1:
 301             # rdf before style master
 302             main_text = input_xml.getroot()[1]
 303         else:
 304             # rdf in style master
 305             main_text = input_xml.getroot()[0]
 306             if main_text.tag == RDFNS('RDF'):
 307                 main_text = None
 308
 309         if main_text is not None:
 310             replace_characters(main_text)
 311
 312             for chunk_xml in chop(main_text):
 313                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
 314                 toc.extend(chunk_toc)
 315                 chars = chars.union(chunk_chars)
 316                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 317                 add_to_manifest(manifest, chunk_counter)
 318                 add_to_spine(spine, chunk_counter)
 319                 chunk_counter += 1
 320
 321         if children:
 322             for child in children:
 323                 child_xml = etree.parse(provider.by_uri(child))
 324                 child_toc, chunk_counter, chunk_chars = transform_file(child_xml, chunk_counter, first=False)
 325                 toc.append(child_toc)
 326                 chars = chars.union(chunk_chars)
 327
 328         return toc, chunk_counter, chars
 329
 330     # read metadata from the first file
 331     input_xml = etree.parse(provider[slug])
 332     metadata = input_xml.find('.//'+RDFNS('Description'))
 333     if metadata is None:
 334         raise NoDublinCore('Document has no DublinCore - which is required.')
 335     book_info = BookInfo.from_element(input_xml)
 336     metadata = etree.ElementTree(metadata)
 337
 338     # if output to dir, create the file
 339     if output_dir is not None:
 340         if make_dir:
 341             author = unicode(book_info.author)
 342             output_dir = os.path.join(output_dir, author)
 343             try:
 344                 os.makedirs(output_dir)
 345             except OSError:
 346                 pass
 347         output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
 348
 349     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 350
 351     # write static elements
 352     mime = zipfile.ZipInfo()
 353     mime.filename = 'mimetype'
 354     mime.compress_type = zipfile.ZIP_STORED
 355     mime.extra = ''
 356     zip.writestr(mime, 'application/epub+zip')
 357     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 358                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 359                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 360                        'media-type="application/oebps-package+xml" />' \
 361                        '</rootfiles></container>')
 362     for fname in 'style.css', 'logo_wolnelektury.png':
 363         zip.write(res(fname), os.path.join('OPS', fname))
 364
 365     opf = xslt(metadata, res('xsltContent.xsl'))
 366     manifest = opf.find('.//' + OPFNS('manifest'))
 367     spine = opf.find('.//' + OPFNS('spine'))
 368
 369     annotations = etree.Element('annotations')
 370
 371     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 372                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 373                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 374                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 375                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 376                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 377                                '</navPoint></navMap></ncx>')
 378     nav_map = toc_file[-1]
 379
 380     toc, chunk_counter, chars = transform_file(input_xml)
 381
 382     if not toc.children:
 383         toc.add(u"Początek utworu", 1)
 384     toc_counter = toc.write_to_xml(nav_map, 2)
 385
 386     # Last modifications in container files and EPUB creation
 387     if len(annotations) > 0:
 388         nav_map.append(etree.fromstring(
 389             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 390             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 391         manifest.append(etree.fromstring(
 392             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 393         spine.append(etree.fromstring(
 394             '<itemref idref="annotations" />'))
 395         replace_by_verse(annotations)
 396         html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
 397         chars = chars.union(used_chars(html_tree.getroot()))
 398         zip.writestr('OPS/annotations.html', etree.tostring(
 399                             html_tree, pretty_print=True))
 400
 401     # strip fonts
 402     tmpdir = mkdtemp('-librarian-epub')
 403     cwd = os.getcwd()
 404
 405     os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 406     for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 407         subprocess.check_call(['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'), res('../fonts/' + fname), os.path.join(tmpdir, fname)])
 408         zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 409     rmtree(tmpdir)
 410     os.chdir(cwd)
 411
 412     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 413     contents = []
 414     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 415     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 416     for st in attributes:
 417         meta = toc_file.makeelement(NCXNS('meta'))
 418         meta.set('name', st)
 419         meta.set('content', '0')
 420         toc_file[0].append(meta)
 421     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 422     toc_file[0][1].set('content', str(toc.depth()))
 423     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 424     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 425     zip.close()