librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import shutil
  11 import sys
  12 from copy import deepcopy
  13 from lxml import etree
  14 import zipfile
  15
  16 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, XHTMLNS, NoDublinCore
  17 from librarian.parser import WLDocument
  18
# TODO: these namespace definitions shouldn't be repeated here
# namespace of the elements of the NCX table of contents (toc.ncx)
NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
# namespace of the elements of the OPF package file (content.opf)
OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
  22
  23
  24 class DocProvider(object):
  25     class DoesNotExist(Exception):
  26         pass
  27
  28     def by_slug(self, slug):
  29         raise NotImplemented
  30
  31     def __getitem__(self, slug):
  32         return self.by_slug(slug)
  33
  34     def by_uri(self, uri):
  35         return self.by_slug(uri.rsplit('/', 1)[1])
  36
  37
  38 class DirDocProvider(DocProvider):
  39     def __init__(self, dir):
  40         self.dir = dir
  41         self.files = {}
  42
  43     def by_slug(self, slug):
  44         return open(os.path.join(self.dir, '%s.xml' % slug))
  45
  46
  47 def inner_xml(node):
  48     """ returns node's text and children as a string
  49
  50     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  51     x<b>y</b>z
  52     """
  53
  54     nt = node.text if node.text is not None else ''
  55     return ''.join([nt] + [etree.tostring(child) for child in node])
  56
  57 def set_inner_xml(node, text):
  58     """ sets node's text and children from a string
  59
  60     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  61     >>> set_inner_xml(e, 'x<b>y</b>z')
  62     >>> print etree.tostring(e)
  63     <a>x<b>y</b>z</a>
  64     """
  65
  66
  67     p = etree.fromstring('<x>%s</x>' % text)
  68     node.text = p.text
  69     node[:] = p[:]
  70
  71
  72 def node_name(node):
  73     """ Find out a node's name
  74
  75     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  76     XYZ
  77     """
  78
  79     tempnode = deepcopy(node)
  80
  81     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  82         for e in tempnode.findall('.//%s' % p):
  83             t = e.tail
  84             e.clear()
  85             e.tail = t
  86     etree.strip_tags(tempnode, '*')
  87     return tempnode.text
  88
  89
  90 def xslt(xml, sheet):
  91     if isinstance(xml, etree._Element):
  92         xml = etree.ElementTree(xml)
  93     with open(sheet) as xsltf:
  94         return xml.xslt(etree.parse(xsltf))
  95
  96
  97 _resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
  98 def res(fname):
  99     return os.path.join(_resdir, fname)
 100
 101
 102 def replace_characters(node):
 103     def replace_chars(text):
 104         if text is None:
 105             return None
 106         return text.replace("---", u"\u2014")\
 107                    .replace("--", u"\u2013")\
 108                    .replace(",,", u"\u201E")\
 109                    .replace('"', u"\u201D")\
 110                    .replace("'", u"\u2019")
 111     if node.tag == 'extra':
 112         node.clear()
 113     else:
 114         node.text = replace_chars(node.text)
 115         node.tail = replace_chars(node.tail)
 116         for child in node:
 117             replace_characters(child)
 118
 119
 120 def find_annotations(annotations, source, part_no):
 121     for child in source:
 122         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 123             annotation = deepcopy(child)
 124             number = str(len(annotations)+1)
 125             annotation.set('number', number)
 126             annotation.set('part', str(part_no))
 127             annotation.tail = ''
 128             annotations.append(annotation)
 129             tail = child.tail
 130             child.clear()
 131             child.tail = tail
 132             child.text = number
 133         if child.tag not in ('extra', 'podtytul'):
 134             find_annotations(annotations, child, part_no)
 135
 136
 137 def replace_by_verse(tree):
 138     """ Find stanzas and create new verses in place of a '/' character """
 139
 140     stanzas = tree.findall('.//' + WLNS('strofa'))
 141     for node in stanzas:
 142         for child_node in node:
 143             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 144                 foreign_verses = inner_xml(child_node).split('/\n')
 145                 if len(foreign_verses) > 1:
 146                     new_foreign = ''
 147                     for foreign_verse in foreign_verses:
 148                         if foreign_verse.startswith('<wers'):
 149                             new_foreign += foreign_verse
 150                         else:
 151                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 152                     set_inner_xml(child_node, new_foreign)
 153         verses = inner_xml(node).split('/\n')
 154         if len(verses) > 1:
 155             modified_inner_xml = ''
 156             for verse in verses:
 157                 if verse.startswith('<wers') or verse.startswith('<extra'):
 158                     modified_inner_xml += verse
 159                 else:
 160                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 161             set_inner_xml(node, modified_inner_xml)
 162
 163
 164 def add_to_manifest(manifest, partno):
 165     """ Adds a node to the manifest section in content.opf file """
 166
 167     partstr = 'part%d' % partno
 168     e = manifest.makeelement(OPFNS('item'), attrib={
 169                                  'id': partstr,
 170                                  'href': partstr + '.html',
 171                                  'media-type': 'application/xhtml+xml',
 172                              })
 173     manifest.append(e)
 174
 175
 176 def add_to_spine(spine, partno):
 177     """ Adds a node to the spine section in content.opf file """
 178
 179     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 180     spine.append(e)
 181
 182
 183 class TOC(object):
 184     def __init__(self, name=None, part_number=None):
 185         self.children = []
 186         self.name = name
 187         self.part_number = part_number
 188         self.sub_number = None
 189
 190     def add(self, name, part_number, level=0, is_part=True):
 191         if level > 0 and self.children:
 192             return self.children[-1].add(name, part_number, level-1, is_part)
 193         else:
 194             t = TOC(name)
 195             t.part_number = part_number
 196             self.children.append(t)
 197             if not is_part:
 198                 t.sub_number = len(self.children) + 1
 199                 return t.sub_number
 200
 201     def append(self, toc):
 202         self.children.append(toc)
 203
 204     def extend(self, toc):
 205         self.children.extend(toc.children)
 206
 207     def depth(self):
 208         if self.children:
 209             return max((c.depth() for c in self.children)) + 1
 210         else:
 211             return 0
 212
 213     def write_to_xml(self, nav_map, counter):
 214         for child in self.children:
 215             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 216             nav_point.set('id', 'NavPoint-%d' % counter)
 217             nav_point.set('playOrder', str(counter))
 218
 219             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 220             text = nav_map.makeelement(NCXNS('text'))
 221             text.text = child.name
 222             nav_label.append(text)
 223             nav_point.append(nav_label)
 224
 225             content = nav_map.makeelement(NCXNS('content'))
 226             src = 'part%d.html' % child.part_number
 227             if child.sub_number is not None:
 228                 src += '#sub%d' % child.sub_number
 229             content.set('src', src)
 230             nav_point.append(content)
 231             nav_map.append(nav_point)
 232             counter = child.write_to_xml(nav_point, counter + 1)
 233         return counter
 234
 235
 236 def chop(main_text):
 237     """ divide main content of the XML file into chunks """
 238
 239     # prepare a container for each chunk
 240     part_xml = etree.Element('utwor')
 241     etree.SubElement(part_xml, 'master')
 242     main_xml_part = part_xml[0] # master
 243
 244     last_node_part = False
 245     for one_part in main_text:
 246         name = one_part.tag
 247         if name == 'naglowek_czesc':
 248             yield part_xml
 249             last_node_part = True
 250             main_xml_part[:] = [deepcopy(one_part)]
 251         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 252             yield part_xml
 253             main_xml_part[:] = [deepcopy(one_part)]
 254         else:
 255             main_xml_part.append(deepcopy(one_part))
 256             last_node_part = False
 257     yield part_xml
 258
 259
 260 def transform_chunk(chunk_xml, chunk_no, annotations):
 261     """ transforms one chunk, returns a HTML string and a TOC object """
 262
 263     toc = TOC()
 264     for element in chunk_xml[0]:
 265         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 266             toc.add(node_name(element), chunk_no)
 267         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 268             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 269             element.set('sub', str(subnumber))
 270     find_annotations(annotations, chunk_xml, chunk_no)
 271     replace_by_verse(chunk_xml)
 272     output_html = etree.tostring(xslt(chunk_xml, res('xsltScheme.xsl')), pretty_print=True)
 273     return output_html, toc
 274
 275
 276 def transform(provider, slug, output_file):
 277     """ produces an epub
 278
 279     provider is a DocProvider
 280     output_file should be filelike object
 281     """
 282
 283     def transform_file(input_xml, chunk_counter=1, first=True):
 284         """ processes one input file and proceeds to its children """
 285
 286         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 287
 288         # every input file will have a TOC entry,
 289         # pointing to starting chunk
 290         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 291         if first:
 292             # write book title page
 293             zip.writestr('OPS/title.html',
 294                  etree.tostring(xslt(input_xml, res('xsltTitle.xsl')), pretty_print=True))
 295         elif children:
 296             # write title page for every parent
 297             zip.writestr('OPS/part%d.html' % chunk_counter,
 298                 etree.tostring(xslt(input_xml, res('xsltChunkTitle.xsl')), pretty_print=True))
 299             add_to_manifest(manifest, chunk_counter)
 300             add_to_spine(spine, chunk_counter)
 301             chunk_counter += 1
 302
 303         if len(input_xml.getroot()) > 1:
 304             # rdf before style master
 305             main_text = input_xml.getroot()[1]
 306         else:
 307             # rdf in style master
 308             main_text = input_xml.getroot()[0]
 309             if main_text.tag == RDFNS('RDF'):
 310                 main_text = None
 311
 312         if main_text is not None:
 313             replace_characters(main_text)
 314
 315             for chunk_no, chunk_xml in enumerate(chop(main_text), chunk_counter):
 316                 chunk_html, chunk_toc = transform_chunk(chunk_xml, chunk_counter, annotations)
 317                 toc.extend(chunk_toc)
 318                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 319                 add_to_manifest(manifest, chunk_counter)
 320                 add_to_spine(spine, chunk_counter)
 321                 chunk_counter += 1
 322
 323         if children:
 324             for child in children:
 325                 child_xml = etree.parse(provider.by_uri(child))
 326                 child_toc, chunk_counter = transform_file(child_xml, chunk_counter, first=False)
 327                 toc.append(child_toc)
 328
 329         return toc, chunk_counter
 330
 331
 332     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 333
 334     # write static elements
 335     mime = zipfile.ZipInfo()
 336     mime.filename = 'mimetype'
 337     mime.compress_type = zipfile.ZIP_STORED
 338     mime.extra = ''
 339     zip.writestr(mime, 'application/epub+zip')
 340     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 341                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 342                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 343                        'media-type="application/oebps-package+xml" />' \
 344                        '</rootfiles></container>')
 345     for fname in 'style.css', 'logo_wolnelektury.png':
 346         zip.write(res(fname), os.path.join('OPS', fname))
 347
 348     # metadata from first file
 349     input_xml = etree.parse(provider[slug])
 350     metadata = input_xml.find('.//'+RDFNS('Description'))
 351     if metadata is None:
 352         raise NoDublinCore('Document has no DublinCore - which is required.')
 353     metadata = etree.ElementTree(metadata)
 354     opf = xslt(metadata, res('xsltContent.xsl'))
 355     manifest = opf.find('.//' + OPFNS('manifest'))
 356     spine = opf.find('.//' + OPFNS('spine'))
 357
 358     annotations = etree.Element('annotations')
 359
 360     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 361                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 362                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 363                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 364                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 365                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 366                                '</navPoint><navPoint id="NavPoint-2" playOrder="2"><navLabel>' \
 367                                '<text>Początek utworu</text></navLabel><content src="part1.html" />' \
 368                                '</navPoint></navMap></ncx>')
 369     nav_map = toc_file[-1]
 370
 371     toc, chunk_counter = transform_file(input_xml)
 372     toc_counter = toc.write_to_xml(nav_map, 3) # we already have 2 navpoints
 373
 374     # Last modifications in container files and EPUB creation
 375     if len(annotations) > 0:
 376         nav_map.append(etree.fromstring(
 377             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 378             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 379         manifest.append(etree.fromstring(
 380             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 381         spine.append(etree.fromstring(
 382             '<itemref idref="annotations" />'))
 383         replace_by_verse(annotations)
 384         zip.writestr('OPS/annotations.html', etree.tostring(
 385                             xslt(annotations, res("xsltAnnotations.xsl")), pretty_print=True))
 386
 387     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 388     contents = []
 389     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 390     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 391     for st in attributes:
 392         meta = toc_file.makeelement(NCXNS('meta'))
 393         meta.set('name', st)
 394         meta.set('content', '0')
 395         toc_file[0].append(meta)
 396     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 397     toc_file[0][1].set('content', str(toc.depth()))
 398     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 399     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 400     zip.close()
 401
 402
 403 if __name__ == '__main__':
 404     if len(sys.argv) < 2:
 405         print >> sys.stderr, 'Usage: python epub.py <input file>'
 406         sys.exit(1)
 407
 408     main_input = sys.argv[1]
 409     basepath, ext = os.path.splitext(main_input)
 410     path, slug = os.path.realpath(basepath).rsplit('/', 1)
 411     output = basepath + '.epub'
 412     provider = DirDocProvider(path)
 413     transform(provider, slug, open(output, 'w'))
 414