librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from StringIO import StringIO
  12 from copy import deepcopy
  13 from lxml import etree
  14 import zipfile
  15 from tempfile import mkdtemp
  16 from shutil import rmtree
  17
  18 import sys
  19
  20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
  21 from librarian.dcparser import BookInfo
  22
  23 from librarian import functions, get_resource
  24
# Called once at import time for its side effect.
# NOTE(review): presumably registers a person-name helper used by the XSLT
# stylesheets -- confirm in librarian.functions.
functions.reg_person_name()
  26
  27
  28 def inner_xml(node):
  29     """ returns node's text and children as a string
  30
  31     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  32     x<b>y</b>z
  33     """
  34
  35     nt = node.text if node.text is not None else ''
  36     return ''.join([nt] + [etree.tostring(child) for child in node])
  37
  38 def set_inner_xml(node, text):
  39     """ sets node's text and children from a string
  40
  41     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  42     >>> set_inner_xml(e, 'x<b>y</b>z')
  43     >>> print etree.tostring(e)
  44     <a>x<b>y</b>z</a>
  45     """
  46
  47     p = etree.fromstring('<x>%s</x>' % text)
  48     node.text = p.text
  49     node[:] = p[:]
  50
  51
  52 def node_name(node):
  53     """ Find out a node's name
  54
  55     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  56     XYZ
  57     """
  58
  59     tempnode = deepcopy(node)
  60
  61     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  62         for e in tempnode.findall('.//%s' % p):
  63             t = e.tail
  64             e.clear()
  65             e.tail = t
  66     etree.strip_tags(tempnode, '*')
  67     return tempnode.text
  68
  69
  70 def xslt(xml, sheet):
  71     if isinstance(xml, etree._Element):
  72         xml = etree.ElementTree(xml)
  73     with open(sheet) as xsltf:
  74         return xml.xslt(etree.parse(xsltf))
  75
  76
  77 def replace_characters(node):
  78     def replace_chars(text):
  79         if text is None:
  80             return None
  81         return text.replace(u"\ufeff", u"")\
  82                    .replace("---", u"\u2014")\
  83                    .replace("--", u"\u2013")\
  84                    .replace(",,", u"\u201E")\
  85                    .replace('"', u"\u201D")\
  86                    .replace("'", u"\u2019")
  87     if node.tag == 'extra':
  88         node.clear()
  89     else:
  90         node.text = replace_chars(node.text)
  91         node.tail = replace_chars(node.tail)
  92         for child in node:
  93             replace_characters(child)
  94
  95
  96 def find_annotations(annotations, source, part_no):
  97     for child in source:
  98         if child.tag in ('pe', 'pa', 'pt', 'pr'):
  99             annotation = deepcopy(child)
 100             number = str(len(annotations)+1)
 101             annotation.set('number', number)
 102             annotation.set('part', str(part_no))
 103             annotation.tail = ''
 104             annotations.append(annotation)
 105             tail = child.tail
 106             child.clear()
 107             child.tail = tail
 108             child.text = number
 109         if child.tag not in ('extra',):
 110             find_annotations(annotations, child, part_no)
 111
 112
 113 def replace_by_verse(tree):
 114     """ Find stanzas and create new verses in place of a '/' character """
 115
 116     stanzas = tree.findall('.//' + WLNS('strofa'))
 117     for node in stanzas:
 118         for child_node in node:
 119             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 120                 foreign_verses = inner_xml(child_node).split('/\n')
 121                 if len(foreign_verses) > 1:
 122                     new_foreign = ''
 123                     for foreign_verse in foreign_verses:
 124                         if foreign_verse.startswith('<wers'):
 125                             new_foreign += foreign_verse
 126                         else:
 127                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 128                     set_inner_xml(child_node, new_foreign)
 129         verses = inner_xml(node).split('/\n')
 130         if len(verses) > 1:
 131             modified_inner_xml = ''
 132             for verse in verses:
 133                 if verse.startswith('<wers') or verse.startswith('<extra'):
 134                     modified_inner_xml += verse
 135                 else:
 136                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 137             set_inner_xml(node, modified_inner_xml)
 138
 139
 140 def add_to_manifest(manifest, partno):
 141     """ Adds a node to the manifest section in content.opf file """
 142
 143     partstr = 'part%d' % partno
 144     e = manifest.makeelement(OPFNS('item'), attrib={
 145                                  'id': partstr,
 146                                  'href': partstr + '.html',
 147                                  'media-type': 'application/xhtml+xml',
 148                              })
 149     manifest.append(e)
 150
 151
 152 def add_to_spine(spine, partno):
 153     """ Adds a node to the spine section in content.opf file """
 154
 155     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 156     spine.append(e)
 157
 158
 159 class TOC(object):
 160     def __init__(self, name=None, part_number=None):
 161         self.children = []
 162         self.name = name
 163         self.part_number = part_number
 164         self.sub_number = None
 165
 166     def add(self, name, part_number, level=0, is_part=True):
 167         if level > 0 and self.children:
 168             return self.children[-1].add(name, part_number, level-1, is_part)
 169         else:
 170             t = TOC(name)
 171             t.part_number = part_number
 172             self.children.append(t)
 173             if not is_part:
 174                 t.sub_number = len(self.children) + 1
 175                 return t.sub_number
 176
 177     def append(self, toc):
 178         self.children.append(toc)
 179
 180     def extend(self, toc):
 181         self.children.extend(toc.children)
 182
 183     def depth(self):
 184         if self.children:
 185             return max((c.depth() for c in self.children)) + 1
 186         else:
 187             return 0
 188
 189     def write_to_xml(self, nav_map, counter):
 190         for child in self.children:
 191             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 192             nav_point.set('id', 'NavPoint-%d' % counter)
 193             nav_point.set('playOrder', str(counter))
 194
 195             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 196             text = nav_map.makeelement(NCXNS('text'))
 197             text.text = child.name
 198             nav_label.append(text)
 199             nav_point.append(nav_label)
 200
 201             content = nav_map.makeelement(NCXNS('content'))
 202             src = 'part%d.html' % child.part_number
 203             if child.sub_number is not None:
 204                 src += '#sub%d' % child.sub_number
 205             content.set('src', src)
 206             nav_point.append(content)
 207             nav_map.append(nav_point)
 208             counter = child.write_to_xml(nav_point, counter + 1)
 209         return counter
 210
 211
 212 def used_chars(element):
 213     """ Lists characters used in an ETree Element """
 214     chars = set((element.text or '') + (element.tail or ''))
 215     for child in element:
 216         chars = chars.union(used_chars(child))
 217     return chars
 218
 219
 220 def chop(main_text):
 221     """ divide main content of the XML file into chunks """
 222
 223     # prepare a container for each chunk
 224     part_xml = etree.Element('utwor')
 225     etree.SubElement(part_xml, 'master')
 226     main_xml_part = part_xml[0] # master
 227
 228     last_node_part = False
 229     for one_part in main_text:
 230         name = one_part.tag
 231         if name == 'naglowek_czesc':
 232             yield part_xml
 233             last_node_part = True
 234             main_xml_part[:] = [deepcopy(one_part)]
 235         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 236             yield part_xml
 237             main_xml_part[:] = [deepcopy(one_part)]
 238         else:
 239             main_xml_part.append(deepcopy(one_part))
 240             last_node_part = False
 241     yield part_xml
 242
 243
 244 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 245     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 246
 247     toc = TOC()
 248     for element in chunk_xml[0]:
 249         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 250             toc.add(node_name(element), chunk_no)
 251         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 252             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 253             element.set('sub', str(subnumber))
 254     if empty:
 255         if not _empty_html_static:
 256             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 257         chars = set()
 258         output_html = _empty_html_static[0]
 259     else:
 260         find_annotations(annotations, chunk_xml, chunk_no)
 261         replace_by_verse(chunk_xml)
 262         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 263         chars = used_chars(html_tree.getroot())
 264         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 265     return output_html, toc, chars
 266
 267
 268 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, sample=None, cover_fn=None):
 269     """ produces a EPUB file
 270
 271     provider: a DocProvider
 272     slug: slug of file to process, available by provider
 273     output_file: file-like object or path to output file
 274     output_dir: path to directory to save output file to; either this or output_file must be present
 275     make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
 276     sample=n: generate sample e-book (with at least n paragraphs)
 277     cover_fn: function(author, title) -> cover image
 278     """
 279
 280     def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
 281         """ processes one input file and proceeds to its children """
 282
 283         replace_characters(input_xml.getroot())
 284
 285         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 286
 287         # every input file will have a TOC entry,
 288         # pointing to starting chunk
 289         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 290         chars = set()
 291         if first:
 292             # write book title page
 293             html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
 294             chars = used_chars(html_tree.getroot())
 295             zip.writestr('OPS/title.html',
 296                  etree.tostring(html_tree, method="html", pretty_print=True))
 297         elif children:
 298             # write title page for every parent
 299             if sample is not None and sample <= 0:
 300                 chars = set()
 301                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 302             else:
 303                 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
 304                 chars = used_chars(html_tree.getroot())
 305                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 306             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 307             add_to_manifest(manifest, chunk_counter)
 308             add_to_spine(spine, chunk_counter)
 309             chunk_counter += 1
 310
 311         if len(input_xml.getroot()) > 1:
 312             # rdf before style master
 313             main_text = input_xml.getroot()[1]
 314         else:
 315             # rdf in style master
 316             main_text = input_xml.getroot()[0]
 317             if main_text.tag == RDFNS('RDF'):
 318                 main_text = None
 319
 320         if main_text is not None:
 321             for chunk_xml in chop(main_text):
 322                 empty = False
 323                 if sample is not None:
 324                     if sample <= 0:
 325                         empty = True
 326                     else:
 327                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 328                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 329
 330                 toc.extend(chunk_toc)
 331                 chars = chars.union(chunk_chars)
 332                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 333                 add_to_manifest(manifest, chunk_counter)
 334                 add_to_spine(spine, chunk_counter)
 335                 chunk_counter += 1
 336
 337         if children:
 338             for child in children:
 339                 child_xml = etree.parse(provider.by_uri(child))
 340                 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
 341                 toc.append(child_toc)
 342                 chars = chars.union(chunk_chars)
 343
 344         return toc, chunk_counter, chars, sample
 345
 346     # read metadata from the first file
 347     if file_path:
 348         if slug:
 349             raise ValueError('slug or file_path should be specified, not both')
 350         f = open(file_path, 'r')
 351         input_xml = etree.parse(f)
 352         f.close()
 353     else:
 354         if not slug:
 355             raise ValueError('either slug or file_path should be specified')
 356         input_xml = etree.parse(provider[slug])
 357
 358     metadata = input_xml.find('.//'+RDFNS('Description'))
 359     if metadata is None:
 360         raise NoDublinCore('Document has no DublinCore - which is required.')
 361     book_info = BookInfo.from_element(input_xml)
 362     metadata = etree.ElementTree(metadata)
 363
 364     # if output to dir, create the file
 365     if output_dir is not None:
 366         if make_dir:
 367             author = unicode(book_info.author)
 368             output_dir = os.path.join(output_dir, author)
 369             try:
 370                 os.makedirs(output_dir)
 371             except OSError:
 372                 pass
 373         if slug:
 374             output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
 375         else:
 376             output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
 377
 378     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 379
 380     # write static elements
 381     mime = zipfile.ZipInfo()
 382     mime.filename = 'mimetype'
 383     mime.compress_type = zipfile.ZIP_STORED
 384     mime.extra = ''
 385     zip.writestr(mime, 'application/epub+zip')
 386     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 387                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 388                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 389                        'media-type="application/oebps-package+xml" />' \
 390                        '</rootfiles></container>')
 391     zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
 392     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 393
 394     opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
 395     manifest = opf.find('.//' + OPFNS('manifest'))
 396     spine = opf.find('.//' + OPFNS('spine'))
 397
 398     if cover_fn:
 399         cover = StringIO()
 400         cover_fn(book_info.author.readable(), book_info.title).save(cover, format='JPEG')
 401         zip.writestr(os.path.join('OPS', 'cover.jpg'), cover.getvalue())
 402         del cover
 403         zip.writestr('OPS/cover.html', open(get_resource('epub/cover.html')).read())
 404         manifest.append(etree.fromstring(
 405             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 406         manifest.append(etree.fromstring(
 407             '<item id="cover-image" href="cover.jpg" media-type="image/jpeg" />'))
 408         spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
 409         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 410         opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))
 411
 412
 413     annotations = etree.Element('annotations')
 414
 415     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 416                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 417                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 418                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 419                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 420                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 421                                '</navPoint></navMap></ncx>')
 422     nav_map = toc_file[-1]
 423
 424     toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
 425
 426     if not toc.children:
 427         toc.add(u"Początek utworu", 1)
 428     toc_counter = toc.write_to_xml(nav_map, 2)
 429
 430     # Last modifications in container files and EPUB creation
 431     if len(annotations) > 0:
 432         nav_map.append(etree.fromstring(
 433             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 434             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 435         toc_counter += 1
 436         manifest.append(etree.fromstring(
 437             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 438         spine.append(etree.fromstring(
 439             '<itemref idref="annotations" />'))
 440         replace_by_verse(annotations)
 441         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 442         chars = chars.union(used_chars(html_tree.getroot()))
 443         zip.writestr('OPS/annotations.html', etree.tostring(
 444                             html_tree, method="html", pretty_print=True))
 445
 446     nav_map.append(etree.fromstring(
 447         '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
 448         '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
 449     manifest.append(etree.fromstring(
 450         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 451     spine.append(etree.fromstring(
 452         '<itemref idref="last" />'))
 453     html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
 454     chars.update(used_chars(html_tree.getroot()))
 455     zip.writestr('OPS/last.html', etree.tostring(
 456                         html_tree, method="html", pretty_print=True))
 457
 458     # strip fonts
 459     tmpdir = mkdtemp('-librarian-epub')
 460     cwd = os.getcwd()
 461
 462     os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 463     for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 464         optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 465                           get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 466         if verbose:
 467             print "Running font-optimizer"
 468             subprocess.check_call(optimizer_call)
 469         else:
 470             subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 471         zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 472     rmtree(tmpdir)
 473     os.chdir(cwd)
 474
 475     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 476     contents = []
 477     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 478     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 479     for st in attributes:
 480         meta = toc_file.makeelement(NCXNS('meta'))
 481         meta.set('name', st)
 482         meta.set('content', '0')
 483         toc_file[0].append(meta)
 484     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 485     toc_file[0][1].set('content', str(toc.depth()))
 486     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 487     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 488     zip.close()