librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import shutil
  11 import sys
  12 from copy import deepcopy
  13 from lxml import etree
  14 import zipfile
  15
  16 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, XHTMLNS, NoDublinCore
  17 from librarian.parser import WLDocument
  18
# TODO: these namespace definitions shouldn't be repeated here
# Namespace of NCX documents; used below to build the toc.ncx navPoint elements.
NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
# Namespace of OPF package documents; used below for content.opf manifest/spine items.
OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
  22
  23
  24 class DocProvider(object):
  25     class DoesNotExist(Exception):
  26         pass
  27
  28     def by_slug(self, slug):
  29         raise NotImplemented
  30
  31     def __getitem__(self, slug):
  32         return self.by_slug(slug)
  33
  34     def by_uri(self, uri):
  35         return self.by_slug(uri.rsplit('/', 1)[1])
  36
  37
  38 class DirDocProvider(DocProvider):
  39     def __init__(self, dir):
  40         self.dir = dir
  41         self.files = {}
  42
  43     def by_slug(self, slug):
  44         return open(os.path.join(self.dir, '%s.xml' % slug))
  45
  46
  47 def inner_xml(node):
  48     """ returns node's text and children as a string
  49
  50     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  51     x<b>y</b>z
  52     """
  53
  54     nt = node.text if node.text is not None else ''
  55     return ''.join([nt] + [etree.tostring(child) for child in node])
  56
  57 def set_inner_xml(node, text):
  58     """ sets node's text and children from a string
  59
  60     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  61     >>> set_inner_xml(e, 'x<b>y</b>z')
  62     >>> print etree.tostring(e)
  63     <a>x<b>y</b>z</a>
  64     """
  65
  66
  67     p = etree.fromstring('<x>%s</x>' % text)
  68     node.text = p.text
  69     node[:] = p[:]
  70
  71
  72 def node_name(node):
  73     """ Find out a node's name
  74
  75     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  76     XYZ
  77     """
  78
  79     tempnode = deepcopy(node)
  80
  81     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  82         for e in tempnode.findall('.//%s' % p):
  83             t = e.tail
  84             e.clear()
  85             e.tail = t
  86     etree.strip_tags(tempnode, '*')
  87     return tempnode.text
  88
  89
  90 def xslt(xml, sheet):
  91     if isinstance(xml, etree._Element):
  92         xml = etree.ElementTree(xml)
  93     with open(sheet) as xsltf:
  94         return xml.xslt(etree.parse(xsltf))
  95
  96
  97 _resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
  98 def res(fname):
  99     return os.path.join(_resdir, fname)
 100
 101
 102 def replace_characters(node):
 103     def replace_chars(text):
 104         if text is None:
 105             return None
 106         return text.replace("&", "&amp;")\
 107                    .replace("---", "&#8212;")\
 108                    .replace("--", "&#8211;")\
 109                    .replace(",,", "&#8222;")\
 110                    .replace('"', "&#8221;")\
 111                    .replace("'", "&#8217;")
 112     if node.tag == 'extra':
 113         node.clear()
 114     else:
 115         node.text = replace_chars(node.text)
 116         node.tail = replace_chars(node.tail)
 117         for child in node:
 118             replace_characters(child)
 119
 120
 121 def find_annotations(annotations, source, part_no):
 122     for child in source:
 123         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 124             annotation = deepcopy(child)
 125             number = str(len(annotations)+1)
 126             annotation.set('number', number)
 127             annotation.set('part', str(part_no))
 128             annotation.tail = ''
 129             annotations.append(annotation)
 130             tail = child.tail
 131             child.clear()
 132             child.tail = tail
 133             child.text = number
 134         if child.tag not in ('extra', 'podtytul'):
 135             find_annotations(annotations, child, part_no)
 136
 137
 138 def replace_by_verse(tree):
 139     """ Find stanzas and create new verses in place of a '/' character """
 140
 141     stanzas = tree.findall('.//' + WLNS('strofa'))
 142     for node in stanzas:
 143         for child_node in node:
 144             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 145                 foreign_verses = inner_xml(child_node).split('/\n')
 146                 if len(foreign_verses) > 1:
 147                     new_foreign = ''
 148                     for foreign_verse in foreign_verses:
 149                         if foreign_verse.startswith('<wers'):
 150                             new_foreign += foreign_verse
 151                         else:
 152                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 153                     set_inner_xml(child_node, new_foreign)
 154         verses = inner_xml(node).split('/\n')
 155         if len(verses) > 1:
 156             modified_inner_xml = ''
 157             for verse in verses:
 158                 if verse.startswith('<wers') or verse.startswith('<extra'):
 159                     modified_inner_xml += verse
 160                 else:
 161                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 162             set_inner_xml(node, modified_inner_xml)
 163
 164
 165 def add_to_manifest(manifest, partno):
 166     """ Adds a node to the manifest section in content.opf file """
 167
 168     partstr = 'part%d' % partno
 169     e = manifest.makeelement(OPFNS('item'), attrib={
 170                                  'id': partstr,
 171                                  'href': partstr + '.html',
 172                                  'media-type': 'application/xhtml+xml',
 173                              })
 174     manifest.append(e)
 175
 176
 177 def add_to_spine(spine, partno):
 178     """ Adds a node to the spine section in content.opf file """
 179
 180     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 181     spine.append(e)
 182
 183
 184 class TOC(object):
 185     def __init__(self, name=None, part_number=None):
 186         self.children = []
 187         self.name = name
 188         self.part_number = part_number
 189         self.sub_number = None
 190
 191     def add(self, name, part_number, level=0, is_part=True):
 192         if level > 0 and self.children:
 193             return self.children[-1].add(name, part_number, level-1, is_part)
 194         else:
 195             t = TOC(name)
 196             t.part_number = part_number
 197             self.children.append(t)
 198             if not is_part:
 199                 t.sub_number = len(self.children) + 1
 200                 return t.sub_number
 201
 202     def append(self, toc):
 203         self.children.append(toc)
 204
 205     def extend(self, toc):
 206         self.children.extend(toc.children)
 207
 208     def depth(self):
 209         if self.children:
 210             return max((c.depth() for c in self.children)) + 1
 211         else:
 212             return 0
 213
 214     def write_to_xml(self, nav_map, counter):
 215         for child in self.children:
 216             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 217             nav_point.set('id', 'NavPoint-%d' % counter)
 218             nav_point.set('playOrder', str(counter))
 219
 220             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 221             text = nav_map.makeelement(NCXNS('text'))
 222             text.text = child.name
 223             nav_label.append(text)
 224             nav_point.append(nav_label)
 225
 226             content = nav_map.makeelement(NCXNS('content'))
 227             src = 'part%d.html' % child.part_number
 228             if child.sub_number is not None:
 229                 src += '#sub%d' % child.sub_number
 230             content.set('src', src)
 231             nav_point.append(content)
 232             nav_map.append(nav_point)
 233             counter = child.write_to_xml(nav_point, counter + 1)
 234         return counter
 235
 236
 237 def chop(main_text):
 238     """ divide main content of the XML file into chunks """
 239
 240     # prepare a container for each chunk
 241     part_xml = etree.Element('utwor')
 242     etree.SubElement(part_xml, 'master')
 243     main_xml_part = part_xml[0] # master
 244
 245     last_node_part = False
 246     for one_part in main_text:
 247         name = one_part.tag
 248         if name == 'naglowek_czesc':
 249             yield part_xml
 250             last_node_part = True
 251             main_xml_part[:] = [deepcopy(one_part)]
 252         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 253             yield part_xml
 254             main_xml_part[:] = [deepcopy(one_part)]
 255         else:
 256             main_xml_part.append(deepcopy(one_part))
 257             last_node_part = False
 258     yield part_xml
 259
 260
 261 def transform_chunk(chunk_xml, chunk_no, annotations):
 262     """ transforms one chunk, returns a HTML string and a TOC object """
 263
 264     toc = TOC()
 265     for element in chunk_xml[0]:
 266         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 267             toc.add(node_name(element), chunk_no)
 268         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 269             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 270             element.set('sub', str(subnumber))
 271     find_annotations(annotations, chunk_xml, chunk_no)
 272     replace_by_verse(chunk_xml)
 273     output_html = etree.tostring(xslt(chunk_xml, res('xsltScheme.xsl')), pretty_print=True)
 274     return output_html, toc
 275
 276
 277 def transform(provider, slug, output_file):
 278     """ produces an epub
 279
 280     provider is a DocProvider
 281     output_file should be filelike object
 282     """
 283
 284     def transform_file(input_xml, chunk_counter=1, first=True):
 285         """ processes one input file and proceeds to its children """
 286
 287         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 288
 289         # every input file will have a TOC entry,
 290         # pointing to starting chunk
 291         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 292         if first:
 293             # write book title page
 294             zip.writestr('OPS/title.html',
 295                  etree.tostring(xslt(input_xml, res('xsltTitle.xsl')), pretty_print=True))
 296         elif children:
 297             # write title page for every parent
 298             zip.writestr('OPS/part%d.html' % chunk_counter,
 299                 etree.tostring(xslt(input_xml, res('xsltChunkTitle.xsl')), pretty_print=True))
 300             add_to_manifest(manifest, chunk_counter)
 301             add_to_spine(spine, chunk_counter)
 302             chunk_counter += 1
 303
 304         if len(input_xml.getroot()) > 1:
 305             # rdf before style master
 306             main_text = input_xml.getroot()[1]
 307         else:
 308             # rdf in style master
 309             main_text = input_xml.getroot()[0]
 310             if main_text.tag == RDFNS('RDF'):
 311                 main_text = None
 312
 313         if main_text is not None:
 314             replace_characters(main_text)
 315
 316             for chunk_no, chunk_xml in enumerate(chop(main_text), chunk_counter):
 317                 chunk_html, chunk_toc = transform_chunk(chunk_xml, chunk_counter, annotations)
 318                 toc.extend(chunk_toc)
 319                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 320                 add_to_manifest(manifest, chunk_counter)
 321                 add_to_spine(spine, chunk_counter)
 322                 chunk_counter += 1
 323
 324         if children:
 325             for child in children:
 326                 child_xml = etree.parse(provider.by_uri(child))
 327                 child_toc, chunk_counter = transform_file(child_xml, chunk_counter, first=False)
 328                 toc.append(child_toc)
 329
 330         return toc, chunk_counter
 331
 332
 333     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 334
 335     # write static elements
 336     mime = zipfile.ZipInfo()
 337     mime.filename = 'mimetype'
 338     mime.compress_type = zipfile.ZIP_STORED
 339     mime.extra = ''
 340     zip.writestr(mime, 'application/epub+zip')
 341     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 342                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 343                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 344                        'media-type="application/oebps-package+xml" />' \
 345                        '</rootfiles></container>')
 346     for fname in 'style.css', 'logo_wolnelektury.png':
 347         zip.write(res(fname), os.path.join('OPS', fname))
 348
 349     # metadata from first file
 350     input_xml = etree.parse(provider[slug])
 351     metadata = input_xml.find('.//'+RDFNS('Description'))
 352     if metadata is None:
 353         raise NoDublinCore('Document has no DublinCore - which is required.')
 354     metadata = etree.ElementTree(metadata)
 355     opf = xslt(metadata, res('xsltContent.xsl'))
 356     manifest = opf.find('.//' + OPFNS('manifest'))
 357     spine = opf.find('.//' + OPFNS('spine'))
 358
 359     annotations = etree.Element('annotations')
 360
 361     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 362                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 363                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 364                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 365                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 366                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 367                                '</navPoint><navPoint id="NavPoint-2" playOrder="2"><navLabel>' \
 368                                '<text>Początek utworu</text></navLabel><content src="part1.html" />' \
 369                                '</navPoint></navMap></ncx>')
 370     nav_map = toc_file[-1]
 371
 372     toc, chunk_counter = transform_file(input_xml)
 373     toc_counter = toc.write_to_xml(nav_map, 3) # we already have 2 navpoints
 374
 375     # Last modifications in container files and EPUB creation
 376     if len(annotations) > 0:
 377         nav_map.append(etree.fromstring(
 378             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 379             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 380         manifest.append(etree.fromstring(
 381             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 382         spine.append(etree.fromstring(
 383             '<itemref idref="annotations" />'))
 384         replace_by_verse(annotations)
 385         zip.writestr('OPS/annotations.html', etree.tostring(
 386                             xslt(annotations, res("xsltAnnotations.xsl")), pretty_print=True))
 387
 388     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 389     contents = []
 390     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 391     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 392     for st in attributes:
 393         meta = toc_file.makeelement(NCXNS('meta'))
 394         meta.set('name', st)
 395         meta.set('content', '0')
 396         toc_file[0].append(meta)
 397     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 398     toc_file[0][1].set('content', str(toc.depth()))
 399     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 400     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 401     zip.close()
 402
 403
 404 if __name__ == '__main__':
 405     if len(sys.argv) < 2:
 406         print >> sys.stderr, 'Usage: python epub.py <input file>'
 407         sys.exit(1)
 408
 409     main_input = sys.argv[1]
 410     basepath, ext = os.path.splitext(main_input)
 411     path, slug = os.path.realpath(basepath).rsplit('/', 1)
 412     output = basepath + '.epub'
 413     provider = DirDocProvider(path)
 414     transform(provider, slug, open(output, 'w'))
 415