librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 from copy import deepcopy
  11 from lxml import etree
  12 import zipfile
  13
  14 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NoDublinCore
  15 from librarian.dcparser import BookInfo
  16
# XML namespaces used when building the EPUB container files:
# NCXNS qualifies the elements of the table of contents (toc.ncx),
# OPFNS qualifies the manifest/spine elements of the package file (content.opf).
NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
  19
  20
  21 class DocProvider(object):
  22     class DoesNotExist(Exception):
  23         pass
  24
  25     def by_slug(self, slug):
  26         raise NotImplemented
  27
  28     def __getitem__(self, slug):
  29         return self.by_slug(slug)
  30
  31     def by_uri(self, uri):
  32         return self.by_slug(uri.rsplit('/', 1)[1])
  33
  34
  35 class DirDocProvider(DocProvider):
  36     def __init__(self, dir):
  37         self.dir = dir
  38         self.files = {}
  39
  40     def by_slug(self, slug):
  41         return open(os.path.join(self.dir, '%s.xml' % slug))
  42
  43
  44 def inner_xml(node):
  45     """ returns node's text and children as a string
  46
  47     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  48     x<b>y</b>z
  49     """
  50
  51     nt = node.text if node.text is not None else ''
  52     return ''.join([nt] + [etree.tostring(child) for child in node])
  53
  54 def set_inner_xml(node, text):
  55     """ sets node's text and children from a string
  56
  57     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  58     >>> set_inner_xml(e, 'x<b>y</b>z')
  59     >>> print etree.tostring(e)
  60     <a>x<b>y</b>z</a>
  61     """
  62
  63     p = etree.fromstring('<x>%s</x>' % text)
  64     node.text = p.text
  65     node[:] = p[:]
  66
  67
  68 def node_name(node):
  69     """ Find out a node's name
  70
  71     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  72     XYZ
  73     """
  74
  75     tempnode = deepcopy(node)
  76
  77     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  78         for e in tempnode.findall('.//%s' % p):
  79             t = e.tail
  80             e.clear()
  81             e.tail = t
  82     etree.strip_tags(tempnode, '*')
  83     return tempnode.text
  84
  85
  86 def xslt(xml, sheet):
  87     if isinstance(xml, etree._Element):
  88         xml = etree.ElementTree(xml)
  89     with open(sheet) as xsltf:
  90         return xml.xslt(etree.parse(xsltf))
  91
  92
  93 _resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
  94 def res(fname):
  95     return os.path.join(_resdir, fname)
  96
  97
  98 def replace_characters(node):
  99     def replace_chars(text):
 100         if text is None:
 101             return None
 102         return text.replace("---", u"\u2014")\
 103                    .replace("--", u"\u2013")\
 104                    .replace(",,", u"\u201E")\
 105                    .replace('"', u"\u201D")\
 106                    .replace("'", u"\u2019")
 107     if node.tag == 'extra':
 108         node.clear()
 109     else:
 110         node.text = replace_chars(node.text)
 111         node.tail = replace_chars(node.tail)
 112         for child in node:
 113             replace_characters(child)
 114
 115
 116 def find_annotations(annotations, source, part_no):
 117     for child in source:
 118         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 119             annotation = deepcopy(child)
 120             number = str(len(annotations)+1)
 121             annotation.set('number', number)
 122             annotation.set('part', str(part_no))
 123             annotation.tail = ''
 124             annotations.append(annotation)
 125             tail = child.tail
 126             child.clear()
 127             child.tail = tail
 128             child.text = number
 129         if child.tag not in ('extra', 'podtytul'):
 130             find_annotations(annotations, child, part_no)
 131
 132
 133 def replace_by_verse(tree):
 134     """ Find stanzas and create new verses in place of a '/' character """
 135
 136     stanzas = tree.findall('.//' + WLNS('strofa'))
 137     for node in stanzas:
 138         for child_node in node:
 139             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 140                 foreign_verses = inner_xml(child_node).split('/\n')
 141                 if len(foreign_verses) > 1:
 142                     new_foreign = ''
 143                     for foreign_verse in foreign_verses:
 144                         if foreign_verse.startswith('<wers'):
 145                             new_foreign += foreign_verse
 146                         else:
 147                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 148                     set_inner_xml(child_node, new_foreign)
 149         verses = inner_xml(node).split('/\n')
 150         if len(verses) > 1:
 151             modified_inner_xml = ''
 152             for verse in verses:
 153                 if verse.startswith('<wers') or verse.startswith('<extra'):
 154                     modified_inner_xml += verse
 155                 else:
 156                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 157             set_inner_xml(node, modified_inner_xml)
 158
 159
 160 def add_to_manifest(manifest, partno):
 161     """ Adds a node to the manifest section in content.opf file """
 162
 163     partstr = 'part%d' % partno
 164     e = manifest.makeelement(OPFNS('item'), attrib={
 165                                  'id': partstr,
 166                                  'href': partstr + '.html',
 167                                  'media-type': 'application/xhtml+xml',
 168                              })
 169     manifest.append(e)
 170
 171
 172 def add_to_spine(spine, partno):
 173     """ Adds a node to the spine section in content.opf file """
 174
 175     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 176     spine.append(e)
 177
 178
 179 class TOC(object):
 180     def __init__(self, name=None, part_number=None):
 181         self.children = []
 182         self.name = name
 183         self.part_number = part_number
 184         self.sub_number = None
 185
 186     def add(self, name, part_number, level=0, is_part=True):
 187         if level > 0 and self.children:
 188             return self.children[-1].add(name, part_number, level-1, is_part)
 189         else:
 190             t = TOC(name)
 191             t.part_number = part_number
 192             self.children.append(t)
 193             if not is_part:
 194                 t.sub_number = len(self.children) + 1
 195                 return t.sub_number
 196
 197     def append(self, toc):
 198         self.children.append(toc)
 199
 200     def extend(self, toc):
 201         self.children.extend(toc.children)
 202
 203     def depth(self):
 204         if self.children:
 205             return max((c.depth() for c in self.children)) + 1
 206         else:
 207             return 0
 208
 209     def write_to_xml(self, nav_map, counter):
 210         for child in self.children:
 211             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 212             nav_point.set('id', 'NavPoint-%d' % counter)
 213             nav_point.set('playOrder', str(counter))
 214
 215             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 216             text = nav_map.makeelement(NCXNS('text'))
 217             text.text = child.name
 218             nav_label.append(text)
 219             nav_point.append(nav_label)
 220
 221             content = nav_map.makeelement(NCXNS('content'))
 222             src = 'part%d.html' % child.part_number
 223             if child.sub_number is not None:
 224                 src += '#sub%d' % child.sub_number
 225             content.set('src', src)
 226             nav_point.append(content)
 227             nav_map.append(nav_point)
 228             counter = child.write_to_xml(nav_point, counter + 1)
 229         return counter
 230
 231
 232 def chop(main_text):
 233     """ divide main content of the XML file into chunks """
 234
 235     # prepare a container for each chunk
 236     part_xml = etree.Element('utwor')
 237     etree.SubElement(part_xml, 'master')
 238     main_xml_part = part_xml[0] # master
 239
 240     last_node_part = False
 241     for one_part in main_text:
 242         name = one_part.tag
 243         if name == 'naglowek_czesc':
 244             yield part_xml
 245             last_node_part = True
 246             main_xml_part[:] = [deepcopy(one_part)]
 247         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 248             yield part_xml
 249             main_xml_part[:] = [deepcopy(one_part)]
 250         else:
 251             main_xml_part.append(deepcopy(one_part))
 252             last_node_part = False
 253     yield part_xml
 254
 255
 256 def transform_chunk(chunk_xml, chunk_no, annotations):
 257     """ transforms one chunk, returns a HTML string and a TOC object """
 258
 259     toc = TOC()
 260     for element in chunk_xml[0]:
 261         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 262             toc.add(node_name(element), chunk_no)
 263         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 264             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 265             element.set('sub', str(subnumber))
 266     find_annotations(annotations, chunk_xml, chunk_no)
 267     replace_by_verse(chunk_xml)
 268     output_html = etree.tostring(xslt(chunk_xml, res('xsltScheme.xsl')), pretty_print=True)
 269     return output_html, toc
 270
 271
 272 def transform(provider, slug, output_file=None, output_dir=None):
 273     """ produces an epub
 274
 275     provider is a DocProvider
 276     either output_file (a file-like object) or output_dir (path to file/dir) should be specified
 277     if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.epub
 278     """
 279
 280     def transform_file(input_xml, chunk_counter=1, first=True):
 281         """ processes one input file and proceeds to its children """
 282
 283         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 284
 285         # every input file will have a TOC entry,
 286         # pointing to starting chunk
 287         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 288         if first:
 289             # write book title page
 290             zip.writestr('OPS/title.html',
 291                  etree.tostring(xslt(input_xml, res('xsltTitle.xsl')), pretty_print=True))
 292         elif children:
 293             # write title page for every parent
 294             zip.writestr('OPS/part%d.html' % chunk_counter,
 295                 etree.tostring(xslt(input_xml, res('xsltChunkTitle.xsl')), pretty_print=True))
 296             add_to_manifest(manifest, chunk_counter)
 297             add_to_spine(spine, chunk_counter)
 298             chunk_counter += 1
 299
 300         if len(input_xml.getroot()) > 1:
 301             # rdf before style master
 302             main_text = input_xml.getroot()[1]
 303         else:
 304             # rdf in style master
 305             main_text = input_xml.getroot()[0]
 306             if main_text.tag == RDFNS('RDF'):
 307                 main_text = None
 308
 309         if main_text is not None:
 310             replace_characters(main_text)
 311
 312             for chunk_no, chunk_xml in enumerate(chop(main_text), chunk_counter):
 313                 chunk_html, chunk_toc = transform_chunk(chunk_xml, chunk_counter, annotations)
 314                 toc.extend(chunk_toc)
 315                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 316                 add_to_manifest(manifest, chunk_counter)
 317                 add_to_spine(spine, chunk_counter)
 318                 chunk_counter += 1
 319
 320         if children:
 321             for child in children:
 322                 child_xml = etree.parse(provider.by_uri(child))
 323                 child_toc, chunk_counter = transform_file(child_xml, chunk_counter, first=False)
 324                 toc.append(child_toc)
 325
 326         return toc, chunk_counter
 327
 328     # read metadata from the first file
 329     input_xml = etree.parse(provider[slug])
 330     metadata = input_xml.find('.//'+RDFNS('Description'))
 331     if metadata is None:
 332         raise NoDublinCore('Document has no DublinCore - which is required.')
 333     book_info = BookInfo.from_element(input_xml)
 334     metadata = etree.ElementTree(metadata)
 335
 336     # if output to dir, create the file
 337     if output_dir is not None:
 338         author = unicode(book_info.author)
 339         author_dir = os.path.join(output_dir, author)
 340         try:
 341             os.makedirs(author_dir)
 342         except OSError:
 343             pass
 344         output_file = open(os.path.join(author_dir, '%s.epub' % slug), 'w')
 345
 346
 347     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 348
 349     # write static elements
 350     mime = zipfile.ZipInfo()
 351     mime.filename = 'mimetype'
 352     mime.compress_type = zipfile.ZIP_STORED
 353     mime.extra = ''
 354     zip.writestr(mime, 'application/epub+zip')
 355     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 356                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 357                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 358                        'media-type="application/oebps-package+xml" />' \
 359                        '</rootfiles></container>')
 360     for fname in 'style.css', 'logo_wolnelektury.png':
 361         zip.write(res(fname), os.path.join('OPS', fname))
 362
 363     opf = xslt(metadata, res('xsltContent.xsl'))
 364     manifest = opf.find('.//' + OPFNS('manifest'))
 365     spine = opf.find('.//' + OPFNS('spine'))
 366
 367     annotations = etree.Element('annotations')
 368
 369     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 370                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 371                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 372                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 373                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 374                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 375                                '</navPoint><navPoint id="NavPoint-2" playOrder="2"><navLabel>' \
 376                                '<text>Początek utworu</text></navLabel><content src="part1.html" />' \
 377                                '</navPoint></navMap></ncx>')
 378     nav_map = toc_file[-1]
 379
 380     toc, chunk_counter = transform_file(input_xml)
 381     toc_counter = toc.write_to_xml(nav_map, 3) # we already have 2 navpoints
 382
 383     # Last modifications in container files and EPUB creation
 384     if len(annotations) > 0:
 385         nav_map.append(etree.fromstring(
 386             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 387             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 388         manifest.append(etree.fromstring(
 389             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 390         spine.append(etree.fromstring(
 391             '<itemref idref="annotations" />'))
 392         replace_by_verse(annotations)
 393         zip.writestr('OPS/annotations.html', etree.tostring(
 394                             xslt(annotations, res("xsltAnnotations.xsl")), pretty_print=True))
 395
 396     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 397     contents = []
 398     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 399     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 400     for st in attributes:
 401         meta = toc_file.makeelement(NCXNS('meta'))
 402         meta.set('name', st)
 403         meta.set('content', '0')
 404         toc_file[0].append(meta)
 405     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 406     toc_file[0][1].set('content', str(toc.depth()))
 407     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 408     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 409     zip.close()
 410
 411
 412 if __name__ == '__main__':
 413     import sys
 414
 415     if len(sys.argv) < 2:
 416         print >> sys.stderr, 'Usage: python epub.py <input file>'
 417         sys.exit(1)
 418
 419     main_input = sys.argv[1]
 420     basepath, ext = os.path.splitext(main_input)
 421     path, slug = os.path.realpath(basepath).rsplit('/', 1)
 422     provider = DirDocProvider(path)
 423     transform(provider, slug, output_dir=path)
 424