librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from copy import deepcopy
  12 from lxml import etree
  13 import zipfile
  14 from tempfile import mkdtemp
  15 from shutil import rmtree
  16
  17 import sys
  18 sys.path.append('..') # for running from working copy
  19
  20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
  21 from librarian.dcparser import BookInfo
  22
  23
  24 def inner_xml(node):
  25     """ returns node's text and children as a string
  26
  27     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  28     x<b>y</b>z
  29     """
  30
  31     nt = node.text if node.text is not None else ''
  32     return ''.join([nt] + [etree.tostring(child) for child in node])
  33
  34 def set_inner_xml(node, text):
  35     """ sets node's text and children from a string
  36
  37     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  38     >>> set_inner_xml(e, 'x<b>y</b>z')
  39     >>> print etree.tostring(e)
  40     <a>x<b>y</b>z</a>
  41     """
  42
  43     p = etree.fromstring('<x>%s</x>' % text)
  44     node.text = p.text
  45     node[:] = p[:]
  46
  47
  48 def node_name(node):
  49     """ Find out a node's name
  50
  51     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  52     XYZ
  53     """
  54
  55     tempnode = deepcopy(node)
  56
  57     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  58         for e in tempnode.findall('.//%s' % p):
  59             t = e.tail
  60             e.clear()
  61             e.tail = t
  62     etree.strip_tags(tempnode, '*')
  63     return tempnode.text
  64
  65
  66 def xslt(xml, sheet):
  67     if isinstance(xml, etree._Element):
  68         xml = etree.ElementTree(xml)
  69     with open(sheet) as xsltf:
  70         return xml.xslt(etree.parse(xsltf))
  71
  72
  73 _resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
  74 def res(fname):
  75     return os.path.join(_resdir, fname)
  76
  77
  78 def replace_characters(node):
  79     def replace_chars(text):
  80         if text is None:
  81             return None
  82         return text.replace("---", u"\u2014")\
  83                    .replace("--", u"\u2013")\
  84                    .replace(",,", u"\u201E")\
  85                    .replace('"', u"\u201D")\
  86                    .replace("'", u"\u2019")
  87     if node.tag == 'extra':
  88         node.clear()
  89     else:
  90         node.text = replace_chars(node.text)
  91         node.tail = replace_chars(node.tail)
  92         for child in node:
  93             replace_characters(child)
  94
  95
  96 def find_annotations(annotations, source, part_no):
  97     for child in source:
  98         if child.tag in ('pe', 'pa', 'pt', 'pr'):
  99             annotation = deepcopy(child)
 100             number = str(len(annotations)+1)
 101             annotation.set('number', number)
 102             annotation.set('part', str(part_no))
 103             annotation.tail = ''
 104             annotations.append(annotation)
 105             tail = child.tail
 106             child.clear()
 107             child.tail = tail
 108             child.text = number
 109         if child.tag not in ('extra', 'podtytul'):
 110             find_annotations(annotations, child, part_no)
 111
 112
 113 def replace_by_verse(tree):
 114     """ Find stanzas and create new verses in place of a '/' character """
 115
 116     stanzas = tree.findall('.//' + WLNS('strofa'))
 117     for node in stanzas:
 118         for child_node in node:
 119             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 120                 foreign_verses = inner_xml(child_node).split('/\n')
 121                 if len(foreign_verses) > 1:
 122                     new_foreign = ''
 123                     for foreign_verse in foreign_verses:
 124                         if foreign_verse.startswith('<wers'):
 125                             new_foreign += foreign_verse
 126                         else:
 127                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 128                     set_inner_xml(child_node, new_foreign)
 129         verses = inner_xml(node).split('/\n')
 130         if len(verses) > 1:
 131             modified_inner_xml = ''
 132             for verse in verses:
 133                 if verse.startswith('<wers') or verse.startswith('<extra'):
 134                     modified_inner_xml += verse
 135                 else:
 136                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 137             set_inner_xml(node, modified_inner_xml)
 138
 139
 140 def add_to_manifest(manifest, partno):
 141     """ Adds a node to the manifest section in content.opf file """
 142
 143     partstr = 'part%d' % partno
 144     e = manifest.makeelement(OPFNS('item'), attrib={
 145                                  'id': partstr,
 146                                  'href': partstr + '.html',
 147                                  'media-type': 'application/xhtml+xml',
 148                              })
 149     manifest.append(e)
 150
 151
 152 def add_to_spine(spine, partno):
 153     """ Adds a node to the spine section in content.opf file """
 154
 155     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 156     spine.append(e)
 157
 158
 159 class TOC(object):
 160     def __init__(self, name=None, part_number=None):
 161         self.children = []
 162         self.name = name
 163         self.part_number = part_number
 164         self.sub_number = None
 165
 166     def add(self, name, part_number, level=0, is_part=True):
 167         if level > 0 and self.children:
 168             return self.children[-1].add(name, part_number, level-1, is_part)
 169         else:
 170             t = TOC(name)
 171             t.part_number = part_number
 172             self.children.append(t)
 173             if not is_part:
 174                 t.sub_number = len(self.children) + 1
 175                 return t.sub_number
 176
 177     def append(self, toc):
 178         self.children.append(toc)
 179
 180     def extend(self, toc):
 181         self.children.extend(toc.children)
 182
 183     def depth(self):
 184         if self.children:
 185             return max((c.depth() for c in self.children)) + 1
 186         else:
 187             return 0
 188
 189     def write_to_xml(self, nav_map, counter):
 190         for child in self.children:
 191             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 192             nav_point.set('id', 'NavPoint-%d' % counter)
 193             nav_point.set('playOrder', str(counter))
 194
 195             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 196             text = nav_map.makeelement(NCXNS('text'))
 197             text.text = child.name
 198             nav_label.append(text)
 199             nav_point.append(nav_label)
 200
 201             content = nav_map.makeelement(NCXNS('content'))
 202             src = 'part%d.html' % child.part_number
 203             if child.sub_number is not None:
 204                 src += '#sub%d' % child.sub_number
 205             content.set('src', src)
 206             nav_point.append(content)
 207             nav_map.append(nav_point)
 208             counter = child.write_to_xml(nav_point, counter + 1)
 209         return counter
 210
 211
 212 def used_chars(element):
 213     """ Lists characters used in an ETree Element """
 214     print (element.text or '') + (element.tail or '')
 215     chars = set((element.text or '') + (element.tail or ''))
 216     for child in element:
 217         chars = chars.union(used_chars(child))
 218     return chars
 219
 220
 221 def chop(main_text):
 222     """ divide main content of the XML file into chunks """
 223
 224     # prepare a container for each chunk
 225     part_xml = etree.Element('utwor')
 226     etree.SubElement(part_xml, 'master')
 227     main_xml_part = part_xml[0] # master
 228
 229     last_node_part = False
 230     for one_part in main_text:
 231         name = one_part.tag
 232         if name == 'naglowek_czesc':
 233             yield part_xml
 234             last_node_part = True
 235             main_xml_part[:] = [deepcopy(one_part)]
 236         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 237             yield part_xml
 238             main_xml_part[:] = [deepcopy(one_part)]
 239         else:
 240             main_xml_part.append(deepcopy(one_part))
 241             last_node_part = False
 242     yield part_xml
 243
 244
 245 def transform_chunk(chunk_xml, chunk_no, annotations):
 246     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 247
 248     toc = TOC()
 249     for element in chunk_xml[0]:
 250         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 251             toc.add(node_name(element), chunk_no)
 252         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 253             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 254             element.set('sub', str(subnumber))
 255     find_annotations(annotations, chunk_xml, chunk_no)
 256     replace_by_verse(chunk_xml)
 257     html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
 258     chars = used_chars(html_tree.getroot())
 259     output_html = etree.tostring(html_tree, pretty_print=True)
 260     return output_html, toc, chars
 261
 262
 263 def transform(provider, slug, output_file=None, output_dir=None):
 264     """ produces an epub
 265
 266     provider is a DocProvider
 267     either output_file (a file-like object) or output_dir (path to file/dir) should be specified
 268     if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.epub
 269     """
 270
 271     def transform_file(input_xml, chunk_counter=1, first=True):
 272         """ processes one input file and proceeds to its children """
 273
 274         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 275
 276         # every input file will have a TOC entry,
 277         # pointing to starting chunk
 278         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 279         chars = set()
 280         if first:
 281             # write book title page
 282             html_tree = xslt(input_xml, res('xsltTitle.xsl'))
 283             chars = used_chars(html_tree.getroot())
 284             zip.writestr('OPS/title.html',
 285                  etree.tostring(html_tree, pretty_print=True))
 286         elif children:
 287             # write title page for every parent
 288             html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
 289             chars = used_chars(html_tree.getroot())
 290             zip.writestr('OPS/part%d.html' % chunk_counter,
 291                 etree.tostring(html_tree, pretty_print=True))
 292             add_to_manifest(manifest, chunk_counter)
 293             add_to_spine(spine, chunk_counter)
 294             chunk_counter += 1
 295
 296         if len(input_xml.getroot()) > 1:
 297             # rdf before style master
 298             main_text = input_xml.getroot()[1]
 299         else:
 300             # rdf in style master
 301             main_text = input_xml.getroot()[0]
 302             if main_text.tag == RDFNS('RDF'):
 303                 main_text = None
 304
 305         if main_text is not None:
 306             replace_characters(main_text)
 307
 308             for chunk_xml in chop(main_text):
 309                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
 310                 toc.extend(chunk_toc)
 311                 chars = chars.union(chunk_chars)
 312                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 313                 add_to_manifest(manifest, chunk_counter)
 314                 add_to_spine(spine, chunk_counter)
 315                 chunk_counter += 1
 316
 317         if children:
 318             for child in children:
 319                 child_xml = etree.parse(provider.by_uri(child))
 320                 child_toc, chunk_counter, chunk_chars = transform_file(child_xml, chunk_counter, first=False)
 321                 toc.append(child_toc)
 322                 chars = chars.union(chunk_chars)
 323
 324         return toc, chunk_counter, chars
 325
 326     # read metadata from the first file
 327     input_xml = etree.parse(provider[slug])
 328     metadata = input_xml.find('.//'+RDFNS('Description'))
 329     if metadata is None:
 330         raise NoDublinCore('Document has no DublinCore - which is required.')
 331     book_info = BookInfo.from_element(input_xml)
 332     metadata = etree.ElementTree(metadata)
 333
 334     # if output to dir, create the file
 335     if output_dir is not None:
 336         author = unicode(book_info.author)
 337         author_dir = os.path.join(output_dir, author)
 338         try:
 339             os.makedirs(author_dir)
 340         except OSError:
 341             pass
 342         output_file = open(os.path.join(author_dir, '%s.epub' % slug), 'w')
 343
 344
 345     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 346
 347     # write static elements
 348     mime = zipfile.ZipInfo()
 349     mime.filename = 'mimetype'
 350     mime.compress_type = zipfile.ZIP_STORED
 351     mime.extra = ''
 352     zip.writestr(mime, 'application/epub+zip')
 353     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 354                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 355                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 356                        'media-type="application/oebps-package+xml" />' \
 357                        '</rootfiles></container>')
 358     for fname in 'style.css', 'logo_wolnelektury.png':
 359         zip.write(res(fname), os.path.join('OPS', fname))
 360
 361     opf = xslt(metadata, res('xsltContent.xsl'))
 362     manifest = opf.find('.//' + OPFNS('manifest'))
 363     spine = opf.find('.//' + OPFNS('spine'))
 364
 365     annotations = etree.Element('annotations')
 366
 367     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 368                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 369                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 370                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 371                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 372                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 373                                '</navPoint></navMap></ncx>')
 374     nav_map = toc_file[-1]
 375
 376     toc, chunk_counter, chars = transform_file(input_xml)
 377
 378     if not toc.children:
 379         toc.add(u"Początek utworu", 1)
 380     toc_counter = toc.write_to_xml(nav_map, 2)
 381
 382     # Last modifications in container files and EPUB creation
 383     if len(annotations) > 0:
 384         nav_map.append(etree.fromstring(
 385             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 386             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 387         manifest.append(etree.fromstring(
 388             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 389         spine.append(etree.fromstring(
 390             '<itemref idref="annotations" />'))
 391         replace_by_verse(annotations)
 392         html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
 393         chars = chars.union(used_chars(html_tree.getroot()))
 394         zip.writestr('OPS/annotations.html', etree.tostring(
 395                             html_tree, pretty_print=True))
 396
 397     # strip fonts
 398     tmpdir = mkdtemp('-librarian-epub')
 399     cwd = os.getcwd()
 400
 401     os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../font-optimizer'))
 402     for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 403         subprocess.check_call(['./subset.pl', '--chars', ''.join(chars), res('../fonts/' + fname), os.path.join(tmpdir, fname)])
 404         zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 405     rmtree(tmpdir)
 406     os.chdir(cwd)
 407
 408     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 409     contents = []
 410     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 411     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 412     for st in attributes:
 413         meta = toc_file.makeelement(NCXNS('meta'))
 414         meta.set('name', st)
 415         meta.set('content', '0')
 416         toc_file[0].append(meta)
 417     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 418     toc_file[0][1].set('content', str(toc.depth()))
 419     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 420     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 421     zip.close()
 422
 423
 424 if __name__ == '__main__':
 425     from librarian import DirDocProvider
 426
 427     if len(sys.argv) < 2:
 428         print >> sys.stderr, 'Usage: python epub.py <input file>'
 429         sys.exit(1)
 430
 431     main_input = sys.argv[1]
 432     basepath, ext = os.path.splitext(main_input)
 433     path, slug = os.path.realpath(basepath).rsplit('/', 1)
 434     provider = DirDocProvider(path)
 435     transform(provider, slug, output_dir=path)
 436