librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import shutil
  11 import sys
  12 from copy import deepcopy
  13 from lxml import etree
  14 import zipfile
  15
  16 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, XHTMLNS
  17
# Namespace of the elements that make up the toc.ncx navigation file.
NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
# Namespace of the elements that make up the content.opf package file.
OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
  20
  21
  22 def inner_xml(node):
  23     """ returns node's text and children as a string
  24
  25     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  26     x<b>y</b>z
  27     """
  28
  29     nt = node.text if node.text is not None else ''
  30     return ''.join([nt] + [etree.tostring(child) for child in node])
  31
  32 def set_inner_xml(node, text):
  33     """ sets node's text and children from a string
  34
  35     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  36     >>> set_inner_xml(e, 'x<b>y</b>z')
  37     >>> print etree.tostring(e)
  38     <a>x<b>y</b>z</a>
  39     """
  40
  41
  42     p = etree.fromstring('<x>%s</x>' % text)
  43     node.text = p.text
  44     node[:] = p[:]
  45
  46
  47 def node_name(node):
  48     """ Find out a node's name
  49
  50     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  51     XYZ
  52     """
  53
  54     tempnode = deepcopy(node)
  55
  56     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  57         for e in tempnode.findall('.//%s' % p):
  58             t = e.tail
  59             e.clear()
  60             e.tail = t
  61     etree.strip_tags(tempnode, '*')
  62     return tempnode.text
  63
  64
  65 def xslt(xml, sheet):
  66     if isinstance(xml, etree._Element):
  67         xml = etree.ElementTree(xml)
  68     with open(sheet) as xsltf:
  69         return xml.xslt(etree.parse(xsltf))
  70
  71
  72 _resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
  73 def res(fname):
  74     return os.path.join(_resdir, fname)
  75
  76
  77 def replace_characters(node):
  78     def replace_chars(text):
  79         if text is None:
  80             return None
  81         return text.replace("&", "&amp;")\
  82                    .replace("---", "&#8212;")\
  83                    .replace("--", "&#8211;")\
  84                    .replace(",,", "&#8222;")\
  85                    .replace('"', "&#8221;")\
  86                    .replace("'", "&#8217;")
  87     if node.tag == 'extra':
  88         node.clear()
  89     else:
  90         node.text = replace_chars(node.text)
  91         node.tail = replace_chars(node.tail)
  92         for child in node:
  93             replace_characters(child)
  94
  95
  96 def find_annotations(annotations, source, part_number):
  97     for child in source:
  98         if child.tag in ('pe', 'pa', 'pt', 'pr'):
  99             annotation = deepcopy(child)
 100             annotation.set('number', str(len(annotations)+1))
 101             annotation.set('part', str(part_number))
 102             annotation.tail = ''
 103             annotations.append(annotation)
 104             tail = child.tail
 105             child.clear()
 106             child.tail = tail
 107             child.text = str(len(annotations))
 108         if child.tag not in ('extra', 'podtytul'):
 109             find_annotations(annotations, child, part_number)
 110
 111
 112 def replace_by_verse(tree):
 113     """ Find stanzas and create new verses in place of a '/' character """
 114     stanzas = tree.findall('.//' + WLNS('strofa'))
 115     for node in stanzas:
 116         for child_node in node:
 117             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 118                 foreign_verses = inner_xml(child_node).split('/\n')
 119                 if len(foreign_verses) > 1:
 120                     new_foreign = ''
 121                     for foreign_verse in foreign_verses:
 122                         if foreign_verse.startswith('<wers'):
 123                             new_foreign += foreign_verse
 124                         else:
 125                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 126                     set_inner_xml(child_node, new_foreign)
 127         verses = inner_xml(node).split('/\n')
 128         if len(verses) > 1:
 129             modified_inner_xml = ''
 130             for verse in verses:
 131                 if verse.startswith('<wers') or verse.startswith('<extra'):
 132                     modified_inner_xml += verse
 133                 else:
 134                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 135             set_inner_xml(node, modified_inner_xml)
 136
 137
 138 def add_to_manifest(manifest, partno):
 139     """ Adds a node to the manifest section in content.opf file """
 140     partstr = 'part%d' % partno
 141     e = manifest.makeelement(OPFNS('item'), attrib={
 142                                  'id': partstr,
 143                                  'href': partstr + '.html',
 144                                  'media-type': 'application/xhtml+xml',
 145                              })
 146     manifest.append(e)
 147
 148
 149 def add_to_spine(spine, partno):
 150     """ Adds a node to the spine section in content.opf file """
 151     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 152     spine.append(e)
 153
 154
 155 def add_nav_point(nav_map, counter, title, part_counter):
 156     nav_point = nav_map.makeelement(NCXNS('navPoint'))
 157     nav_point.set('id', 'NavPoint-%d' % counter)
 158     nav_point.set('playOrder', str(counter))
 159
 160     nav_label = nav_map.makeelement(NCXNS('navLabel'))
 161     text = nav_map.makeelement(NCXNS('text'))
 162     text.text = title
 163     nav_label.append(text)
 164     nav_point.append(nav_label)
 165
 166     content = nav_map.makeelement(NCXNS('content'))
 167     content.set('src', 'part%d.html' % part_counter)
 168     nav_point.append(content)
 169
 170     nav_map.append(nav_point)
 171
 172
 173 def add_nav_point2(nav_map, counter, title, part_counter, subcounter):
 174     nav_point = nav_map.makeelement(NCXNS('navPoint'))
 175     nav_point.set('id', 'NavPoint-%d' % counter)
 176     nav_point.set('playOrder', str(counter))
 177
 178     nav_label = nav_map.makeelement(NCXNS('navLabel'))
 179     text = nav_map.makeelement(NCXNS('text'))
 180     text.text = title
 181     nav_label.append(text)
 182     nav_point.append(nav_label)
 183
 184     content = nav_map.makeelement(NCXNS('content'))
 185     content.set('src', 'part%d.html#sub%d' % (part_counter, subcounter))
 186     nav_point.append(content)
 187
 188     nav_map[-1].append(nav_point)
 189
 190
 191 def transform(input_file, output_file):
 192     """ produces an epub
 193
 194     input_file and output_file should be filelike objects
 195     """
 196
 197     input_xml = etree.parse(input_file)
 198
 199     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 200
 201     mime = zipfile.ZipInfo()
 202     mime.filename = 'mimetype'
 203     mime.compress_type = zipfile.ZIP_STORED
 204     mime.extra = ''
 205     zip.writestr(mime, 'application/epub+zip')
 206
 207     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 208                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 209                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 210                        'media-type="application/oebps-package+xml" />' \
 211                        '</rootfiles></container>')
 212
 213     metadata_el = input_xml.find('.//'+RDFNS('Description'))
 214     metadatasource = etree.ElementTree(metadata_el)
 215
 216     opf = xslt(metadatasource, res('xsltContent.xsl'))
 217
 218     manifest = opf.find('.//' + OPFNS('manifest'))
 219     spine = opf.find('.//' + OPFNS('spine'))
 220
 221     for fname in 'style.css', 'logo_wolnelektury.png':
 222         zip.write(res(fname), os.path.join('OPS', fname))
 223
 224     annotations = etree.Element('annotations')
 225     part_xml = etree.Element('utwor')
 226     etree.SubElement(part_xml, 'master')
 227
 228     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 229                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 230                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 231                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 232                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 233                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 234                                '</navPoint><navPoint id="NavPoint-2" playOrder="2"><navLabel>' \
 235                                '<text>Początek utworu</text></navLabel><content src="part1.html" />' \
 236                                '</navPoint></navMap></ncx>')
 237
 238     main_xml_part = part_xml[0] # było [0][0], master
 239     nav_map = toc_file[-1] # było [-1][-1]
 240     depth = 1 # navmap
 241
 242     if len(input_xml.getroot()) > 1:
 243         # rdf before style master
 244         main_text = input_xml.getroot()[1]
 245     else:
 246         # rdf in style master
 247         main_text = input_xml.getroot()[0]
 248
 249     replace_characters(main_text)
 250     zip.writestr('OPS/title.html',
 251                  etree.tostring(xslt(input_xml, res('xsltTitle.xsl')), pretty_print=True))
 252
 253     # Search for table of contents elements and book division
 254
 255     stupid_i = stupid_j = stupid_k = 1
 256     last_node_part = False
 257     for one_part in main_text:
 258         name = one_part.tag
 259         if name in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 260             if name == "naglowek_czesc":
 261                 stupid_k = 1
 262                 last_node_part = True
 263                 find_annotations(annotations, part_xml, stupid_j)
 264                 replace_by_verse(part_xml)
 265                 zip.writestr('OPS/part%d.html' % stupid_j,
 266                             etree.tostring(xslt(part_xml, res('xsltScheme.xsl')), pretty_print=True))
 267                 main_xml_part[:] = [deepcopy(one_part)]
 268                 # add to manifest and spine
 269                 add_to_manifest(manifest, stupid_j)
 270                 add_to_spine(spine, stupid_j)
 271                 name_toc = node_name(one_part)
 272                 # build table of contents
 273                 # i+2 because of title page
 274                 add_nav_point(nav_map, stupid_i+2, name_toc, stupid_j + 1)
 275                 stupid_i += 1
 276                 stupid_j += 1
 277             else:
 278                 if last_node_part:
 279                     main_xml_part.append(one_part)
 280                     last_node_part = False
 281                     name_toc = node_name(one_part)
 282                     add_nav_point(nav_map, stupid_i + 1, name_toc, stupid_j)
 283                 else:
 284                     stupid_k = 1
 285                     find_annotations(annotations, part_xml, stupid_j)
 286                     replace_by_verse(part_xml)
 287                     zip.writestr('OPS/part%d.html' % stupid_j,
 288                                  etree.tostring(xslt(part_xml, res('xsltScheme.xsl')), pretty_print=True))
 289                     # start building a new part
 290                     main_xml_part[:] = [deepcopy(one_part)]
 291                     add_to_manifest(manifest, stupid_j)
 292                     add_to_spine(spine, stupid_j)
 293                     name_toc = node_name(one_part)
 294                     add_nav_point(nav_map, stupid_i + 2, name_toc, stupid_j + 1) # title page
 295                     stupid_j += 1
 296                     stupid_i += 1
 297         else:
 298             if name in ('naglowek_podrozdzial', 'naglowek_scena'):
 299                 depth = 2
 300                 name_toc =  node_name(one_part)
 301                 add_nav_point2(nav_map, stupid_i + 2, name_toc, stupid_j, stupid_k)
 302                 one_part.set('sub', str(stupid_k))
 303                 stupid_k += 1
 304                 stupid_i += 1
 305             main_xml_part.append(deepcopy(one_part))
 306             last_node_part = False
 307     find_annotations(annotations, part_xml, stupid_j)
 308     replace_by_verse(part_xml)
 309     add_to_manifest(manifest, stupid_j)
 310     add_to_spine(spine, stupid_j)
 311
 312     zip.writestr('OPS/part%d.html' % stupid_j,
 313                  etree.tostring(xslt(part_xml, res('xsltScheme.xsl')), pretty_print=True))
 314
 315     # Last modifications in container files and EPUB creation
 316     if len(annotations) > 0:
 317         nav_map.append(etree.fromstring(
 318             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 319             '</navLabel><content src="annotations.html" /></navPoint>' % {'i':stupid_i+2}))
 320         manifest.append(etree.fromstring(
 321             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 322         spine.append(etree.fromstring(
 323             '<itemref idref="annotations" />'))
 324         replace_by_verse(annotations)
 325         zip.writestr('OPS/annotations.html', etree.tostring(
 326                             xslt(annotations, res("xsltAnnotations.xsl")), pretty_print=True))
 327
 328     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 329     contents = []
 330     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 331     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 332     for st in attributes:
 333         meta = toc_file.makeelement(NCXNS('meta'))
 334         meta.set('name', st)
 335         meta.set('content', '0')
 336         toc_file[0].append(meta)
 337     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 338     toc_file[0][1].set('content', str(depth))
 339     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 340     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 341     zip.close()
 342
 343
 344 if __name__ == '__main__':
 345     import html
 346
 347     if len(sys.argv) < 2:
 348         print >> sys.stderr, 'Usage: python epub.py <input file> [output file]'
 349         sys.exit(1)
 350
 351     input = sys.argv[1]
 352     if len(sys.argv) > 2:
 353         output = sys.argv[2]
 354     else:
 355         basename, ext = os.path.splitext(input)
 356         output = basename + '.epub'
 357
 358     print input
 359     if html.transform(input, is_file=True) == '<empty />':
 360         print 'empty content - skipping'
 361     else:
 362         transform(open(input, 'r'), open(output, 'w'))
 363
 364
 365