librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from StringIO import StringIO
  12 from copy import deepcopy
  13 from lxml import etree
  14 import zipfile
  15 from tempfile import mkdtemp
  16 from shutil import rmtree
  17
  18 import sys
  19
  20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
  21 from librarian.dcparser import BookInfo
  22
  23 from librarian import functions, get_resource
  24
# Executed once, when this module is imported.
# NOTE(review): presumably registers a "person_name" extension function needed
# by the EPUB stylesheets -- confirm in librarian.functions.
functions.reg_person_name()
  26
  27
  28 def inner_xml(node):
  29     """ returns node's text and children as a string
  30
  31     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  32     x<b>y</b>z
  33     """
  34
  35     nt = node.text if node.text is not None else ''
  36     return ''.join([nt] + [etree.tostring(child) for child in node])
  37
  38 def set_inner_xml(node, text):
  39     """ sets node's text and children from a string
  40
  41     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  42     >>> set_inner_xml(e, 'x<b>y</b>z')
  43     >>> print etree.tostring(e)
  44     <a>x<b>y</b>z</a>
  45     """
  46
  47     p = etree.fromstring('<x>%s</x>' % text)
  48     node.text = p.text
  49     node[:] = p[:]
  50
  51
  52 def node_name(node):
  53     """ Find out a node's name
  54
  55     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  56     XYZ
  57     """
  58
  59     tempnode = deepcopy(node)
  60
  61     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  62         for e in tempnode.findall('.//%s' % p):
  63             t = e.tail
  64             e.clear()
  65             e.tail = t
  66     etree.strip_tags(tempnode, '*')
  67     return tempnode.text
  68
  69
  70 def xslt(xml, sheet):
  71     if isinstance(xml, etree._Element):
  72         xml = etree.ElementTree(xml)
  73     with open(sheet) as xsltf:
  74         return xml.xslt(etree.parse(xsltf))
  75
  76
  77 def replace_characters(node):
  78     def replace_chars(text):
  79         if text is None:
  80             return None
  81         return text.replace(u"\ufeff", u"")\
  82                    .replace("---", u"\u2014")\
  83                    .replace("--", u"\u2013")\
  84                    .replace(",,", u"\u201E")\
  85                    .replace('"', u"\u201D")\
  86                    .replace("'", u"\u2019")
  87     if node.tag in ('uwaga', 'extra'):
  88         t = node.tail
  89         node.clear()
  90         node.tail = t
  91     node.text = replace_chars(node.text)
  92     node.tail = replace_chars(node.tail)
  93     for child in node:
  94         replace_characters(child)
  95
  96
  97 def find_annotations(annotations, source, part_no):
  98     for child in source:
  99         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 100             annotation = deepcopy(child)
 101             number = str(len(annotations)+1)
 102             annotation.set('number', number)
 103             annotation.set('part', str(part_no))
 104             annotation.tail = ''
 105             annotations.append(annotation)
 106             tail = child.tail
 107             child.clear()
 108             child.tail = tail
 109             child.text = number
 110         if child.tag not in ('extra', 'uwaga'):
 111             find_annotations(annotations, child, part_no)
 112
 113
 114 def replace_by_verse(tree):
 115     """ Find stanzas and create new verses in place of a '/' character """
 116
 117     stanzas = tree.findall('.//' + WLNS('strofa'))
 118     for node in stanzas:
 119         for child_node in node:
 120             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 121                 foreign_verses = inner_xml(child_node).split('/\n')
 122                 if len(foreign_verses) > 1:
 123                     new_foreign = ''
 124                     for foreign_verse in foreign_verses:
 125                         if foreign_verse.startswith('<wers'):
 126                             new_foreign += foreign_verse
 127                         else:
 128                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 129                     set_inner_xml(child_node, new_foreign)
 130         verses = inner_xml(node).split('/\n')
 131         if len(verses) > 1:
 132             modified_inner_xml = ''
 133             for verse in verses:
 134                 if verse.startswith('<wers') or verse.startswith('<extra'):
 135                     modified_inner_xml += verse
 136                 else:
 137                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 138             set_inner_xml(node, modified_inner_xml)
 139
 140
 141 def add_to_manifest(manifest, partno):
 142     """ Adds a node to the manifest section in content.opf file """
 143
 144     partstr = 'part%d' % partno
 145     e = manifest.makeelement(OPFNS('item'), attrib={
 146                                  'id': partstr,
 147                                  'href': partstr + '.html',
 148                                  'media-type': 'application/xhtml+xml',
 149                              })
 150     manifest.append(e)
 151
 152
 153 def add_to_spine(spine, partno):
 154     """ Adds a node to the spine section in content.opf file """
 155
 156     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 157     spine.append(e)
 158
 159
 160 class TOC(object):
 161     def __init__(self, name=None, part_number=None):
 162         self.children = []
 163         self.name = name
 164         self.part_number = part_number
 165         self.sub_number = None
 166
 167     def add(self, name, part_number, level=0, is_part=True):
 168         if level > 0 and self.children:
 169             return self.children[-1].add(name, part_number, level-1, is_part)
 170         else:
 171             t = TOC(name)
 172             t.part_number = part_number
 173             self.children.append(t)
 174             if not is_part:
 175                 t.sub_number = len(self.children) + 1
 176                 return t.sub_number
 177
 178     def append(self, toc):
 179         self.children.append(toc)
 180
 181     def extend(self, toc):
 182         self.children.extend(toc.children)
 183
 184     def depth(self):
 185         if self.children:
 186             return max((c.depth() for c in self.children)) + 1
 187         else:
 188             return 0
 189
 190     def write_to_xml(self, nav_map, counter):
 191         for child in self.children:
 192             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 193             nav_point.set('id', 'NavPoint-%d' % counter)
 194             nav_point.set('playOrder', str(counter))
 195
 196             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 197             text = nav_map.makeelement(NCXNS('text'))
 198             text.text = child.name
 199             nav_label.append(text)
 200             nav_point.append(nav_label)
 201
 202             content = nav_map.makeelement(NCXNS('content'))
 203             src = 'part%d.html' % child.part_number
 204             if child.sub_number is not None:
 205                 src += '#sub%d' % child.sub_number
 206             content.set('src', src)
 207             nav_point.append(content)
 208             nav_map.append(nav_point)
 209             counter = child.write_to_xml(nav_point, counter + 1)
 210         return counter
 211
 212
 213 def used_chars(element):
 214     """ Lists characters used in an ETree Element """
 215     chars = set((element.text or '') + (element.tail or ''))
 216     for child in element:
 217         chars = chars.union(used_chars(child))
 218     return chars
 219
 220
 221 def chop(main_text):
 222     """ divide main content of the XML file into chunks """
 223
 224     # prepare a container for each chunk
 225     part_xml = etree.Element('utwor')
 226     etree.SubElement(part_xml, 'master')
 227     main_xml_part = part_xml[0] # master
 228
 229     last_node_part = False
 230     for one_part in main_text:
 231         name = one_part.tag
 232         if name == 'naglowek_czesc':
 233             yield part_xml
 234             last_node_part = True
 235             main_xml_part[:] = [deepcopy(one_part)]
 236         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 237             yield part_xml
 238             main_xml_part[:] = [deepcopy(one_part)]
 239         else:
 240             main_xml_part.append(deepcopy(one_part))
 241             last_node_part = False
 242     yield part_xml
 243
 244
 245 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 246     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 247
 248     toc = TOC()
 249     for element in chunk_xml[0]:
 250         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 251             toc.add(node_name(element), chunk_no)
 252         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 253             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
 254             element.set('sub', str(subnumber))
 255     if empty:
 256         if not _empty_html_static:
 257             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 258         chars = set()
 259         output_html = _empty_html_static[0]
 260     else:
 261         find_annotations(annotations, chunk_xml, chunk_no)
 262         replace_by_verse(chunk_xml)
 263         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 264         chars = used_chars(html_tree.getroot())
 265         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 266     return output_html, toc, chars
 267
 268
 269 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
 270               sample=None, cover=None, flags=None):
 271     """ produces a EPUB file
 272
 273     provider: a DocProvider
 274     slug: slug of file to process, available by provider
 275     output_file: file-like object or path to output file
 276     output_dir: path to directory to save output file to; either this or output_file must be present
 277     make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
 278     sample=n: generate sample e-book (with at least n paragraphs)
 279     cover: a cover.Cover object
 280     flags: less-advertising,
 281     """
 282
 283     def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
 284         """ processes one input file and proceeds to its children """
 285
 286         replace_characters(input_xml.getroot())
 287
 288         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 289
 290         # every input file will have a TOC entry,
 291         # pointing to starting chunk
 292         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
 293         chars = set()
 294         if first:
 295             # write book title page
 296             html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
 297             chars = used_chars(html_tree.getroot())
 298             zip.writestr('OPS/title.html',
 299                  etree.tostring(html_tree, method="html", pretty_print=True))
 300         elif children:
 301             # write title page for every parent
 302             if sample is not None and sample <= 0:
 303                 chars = set()
 304                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 305             else:
 306                 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
 307                 chars = used_chars(html_tree.getroot())
 308                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 309             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 310             add_to_manifest(manifest, chunk_counter)
 311             add_to_spine(spine, chunk_counter)
 312             chunk_counter += 1
 313
 314         if len(input_xml.getroot()) > 1:
 315             # rdf before style master
 316             main_text = input_xml.getroot()[1]
 317         else:
 318             # rdf in style master
 319             main_text = input_xml.getroot()[0]
 320             if main_text.tag == RDFNS('RDF'):
 321                 main_text = None
 322
 323         if main_text is not None:
 324             for chunk_xml in chop(main_text):
 325                 empty = False
 326                 if sample is not None:
 327                     if sample <= 0:
 328                         empty = True
 329                     else:
 330                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 331                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 332
 333                 toc.extend(chunk_toc)
 334                 chars = chars.union(chunk_chars)
 335                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 336                 add_to_manifest(manifest, chunk_counter)
 337                 add_to_spine(spine, chunk_counter)
 338                 chunk_counter += 1
 339
 340         if children:
 341             for child in children:
 342                 child_xml = etree.parse(provider.by_uri(child))
 343                 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
 344                 toc.append(child_toc)
 345                 chars = chars.union(chunk_chars)
 346
 347         return toc, chunk_counter, chars, sample
 348
 349     # read metadata from the first file
 350     if file_path:
 351         if slug:
 352             raise ValueError('slug or file_path should be specified, not both')
 353         f = open(file_path, 'r')
 354         input_xml = etree.parse(f)
 355         f.close()
 356     else:
 357         if not slug:
 358             raise ValueError('either slug or file_path should be specified')
 359         input_xml = etree.parse(provider[slug])
 360
 361     if flags:
 362         for flag in flags:
 363             input_xml.getroot().set(flag, 'yes')
 364
 365     metadata = input_xml.find('.//'+RDFNS('Description'))
 366     if metadata is None:
 367         raise NoDublinCore('Document has no DublinCore - which is required.')
 368     book_info = BookInfo.from_element(input_xml)
 369     metadata = etree.ElementTree(metadata)
 370
 371     # if output to dir, create the file
 372     if output_dir is not None:
 373         if make_dir:
 374             author = unicode(book_info.author)
 375             output_dir = os.path.join(output_dir, author)
 376             try:
 377                 os.makedirs(output_dir)
 378             except OSError:
 379                 pass
 380         if slug:
 381             output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
 382         else:
 383             output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
 384
 385     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 386
 387     # write static elements
 388     mime = zipfile.ZipInfo()
 389     mime.filename = 'mimetype'
 390     mime.compress_type = zipfile.ZIP_STORED
 391     mime.extra = ''
 392     zip.writestr(mime, 'application/epub+zip')
 393     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 394                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 395                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 396                        'media-type="application/oebps-package+xml" />' \
 397                        '</rootfiles></container>')
 398     zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
 399     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 400
 401     opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
 402     manifest = opf.find('.//' + OPFNS('manifest'))
 403     spine = opf.find('.//' + OPFNS('spine'))
 404
 405     if cover:
 406         cover_file = StringIO()
 407         c = cover(book_info.author.readable(), book_info.title)
 408         c.save(cover_file)
 409         c_name = 'cover.%s' % c.ext()
 410         zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
 411         del cover_file
 412
 413         cover_tree = etree.parse(get_resource('epub/cover.html'))
 414         cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
 415         zip.writestr('OPS/cover.html', etree.tostring(
 416                         cover_tree, method="html", pretty_print=True))
 417
 418         manifest.append(etree.fromstring(
 419             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 420         manifest.append(etree.fromstring(
 421             '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
 422         spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
 423         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 424         opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))
 425
 426
 427     annotations = etree.Element('annotations')
 428
 429     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 430                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 431                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 432                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 433                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
 434                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
 435                                '</navPoint></navMap></ncx>')
 436     nav_map = toc_file[-1]
 437
 438     toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
 439
 440     if not toc.children:
 441         toc.add(u"Początek utworu", 1)
 442     toc_counter = toc.write_to_xml(nav_map, 2)
 443
 444     # Last modifications in container files and EPUB creation
 445     if len(annotations) > 0:
 446         nav_map.append(etree.fromstring(
 447             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
 448             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
 449         toc_counter += 1
 450         manifest.append(etree.fromstring(
 451             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 452         spine.append(etree.fromstring(
 453             '<itemref idref="annotations" />'))
 454         replace_by_verse(annotations)
 455         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 456         chars = chars.union(used_chars(html_tree.getroot()))
 457         zip.writestr('OPS/annotations.html', etree.tostring(
 458                             html_tree, method="html", pretty_print=True))
 459
 460     nav_map.append(etree.fromstring(
 461         '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
 462         '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
 463     manifest.append(etree.fromstring(
 464         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 465     spine.append(etree.fromstring(
 466         '<itemref idref="last" />'))
 467     html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
 468     chars.update(used_chars(html_tree.getroot()))
 469     zip.writestr('OPS/last.html', etree.tostring(
 470                         html_tree, method="html", pretty_print=True))
 471
 472     # strip fonts
 473     tmpdir = mkdtemp('-librarian-epub')
 474     cwd = os.getcwd()
 475
 476     os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 477     for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 478         optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 479                           get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 480         if verbose:
 481             print "Running font-optimizer"
 482             subprocess.check_call(optimizer_call)
 483         else:
 484             subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 485         zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 486     rmtree(tmpdir)
 487     os.chdir(cwd)
 488
 489     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 490     contents = []
 491     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
 492     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 493     for st in attributes:
 494         meta = toc_file.makeelement(NCXNS('meta'))
 495         meta.set('name', st)
 496         meta.set('content', '0')
 497         toc_file[0].append(meta)
 498     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 499     toc_file[0][1].set('content', str(toc.depth()))
 500     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 501     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 502     zip.close()