librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from StringIO import StringIO
  12 from copy import deepcopy
  13 from lxml import etree
  14 import zipfile
  15 from tempfile import mkdtemp, NamedTemporaryFile
  16 from shutil import rmtree
  17
  18 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
  19
  20 from librarian import functions, get_resource
  21
# Import-time side effect: run the registration hook from librarian.functions.
# NOTE(review): presumably this makes a person-name helper available to the
# XSLT stylesheets used below -- confirm in librarian.functions.
functions.reg_person_name()
  23
  24
  25 def inner_xml(node):
  26     """ returns node's text and children as a string
  27
  28     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  29     x<b>y</b>z
  30     """
  31
  32     nt = node.text if node.text is not None else ''
  33     return ''.join([nt] + [etree.tostring(child) for child in node])
  34
  35 def set_inner_xml(node, text):
  36     """ sets node's text and children from a string
  37
  38     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  39     >>> set_inner_xml(e, 'x<b>y</b>z')
  40     >>> print etree.tostring(e)
  41     <a>x<b>y</b>z</a>
  42     """
  43
  44     p = etree.fromstring('<x>%s</x>' % text)
  45     node.text = p.text
  46     node[:] = p[:]
  47
  48
  49 def node_name(node):
  50     """ Find out a node's name
  51
  52     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  53     XYZ
  54     """
  55
  56     tempnode = deepcopy(node)
  57
  58     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  59         for e in tempnode.findall('.//%s' % p):
  60             t = e.tail
  61             e.clear()
  62             e.tail = t
  63     etree.strip_tags(tempnode, '*')
  64     return tempnode.text
  65
  66
  67 def xslt(xml, sheet):
  68     if isinstance(xml, etree._Element):
  69         xml = etree.ElementTree(xml)
  70     with open(sheet) as xsltf:
  71         return xml.xslt(etree.parse(xsltf))
  72
  73
  74 def replace_characters(node):
  75     def replace_chars(text):
  76         if text is None:
  77             return None
  78         return text.replace(u"\ufeff", u"")\
  79                    .replace("---", u"\u2014")\
  80                    .replace("--", u"\u2013")\
  81                    .replace(",,", u"\u201E")\
  82                    .replace('"', u"\u201D")\
  83                    .replace("'", u"\u2019")
  84     if node.tag in ('uwaga', 'extra'):
  85         t = node.tail
  86         node.clear()
  87         node.tail = t
  88     node.text = replace_chars(node.text)
  89     node.tail = replace_chars(node.tail)
  90     for child in node:
  91         replace_characters(child)
  92
  93
  94 def find_annotations(annotations, source, part_no):
  95     for child in source:
  96         if child.tag in ('pe', 'pa', 'pt', 'pr'):
  97             annotation = deepcopy(child)
  98             number = str(len(annotations)+1)
  99             annotation.set('number', number)
 100             annotation.set('part', str(part_no))
 101             annotation.tail = ''
 102             annotations.append(annotation)
 103             tail = child.tail
 104             child.clear()
 105             child.tail = tail
 106             child.text = number
 107         if child.tag not in ('extra', 'uwaga'):
 108             find_annotations(annotations, child, part_no)
 109
 110
 111 def replace_by_verse(tree):
 112     """ Find stanzas and create new verses in place of a '/' character """
 113
 114     stanzas = tree.findall('.//' + WLNS('strofa'))
 115     for node in stanzas:
 116         for child_node in node:
 117             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 118                 foreign_verses = inner_xml(child_node).split('/\n')
 119                 if len(foreign_verses) > 1:
 120                     new_foreign = ''
 121                     for foreign_verse in foreign_verses:
 122                         if foreign_verse.startswith('<wers'):
 123                             new_foreign += foreign_verse
 124                         else:
 125                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 126                     set_inner_xml(child_node, new_foreign)
 127         verses = inner_xml(node).split('/\n')
 128         if len(verses) > 1:
 129             modified_inner_xml = ''
 130             for verse in verses:
 131                 if verse.startswith('<wers') or verse.startswith('<extra'):
 132                     modified_inner_xml += verse
 133                 else:
 134                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 135             set_inner_xml(node, modified_inner_xml)
 136
 137
 138 def add_to_manifest(manifest, partno):
 139     """ Adds a node to the manifest section in content.opf file """
 140
 141     partstr = 'part%d' % partno
 142     e = manifest.makeelement(OPFNS('item'), attrib={
 143                                  'id': partstr,
 144                                  'href': partstr + '.html',
 145                                  'media-type': 'application/xhtml+xml',
 146                              })
 147     manifest.append(e)
 148
 149
 150 def add_to_spine(spine, partno):
 151     """ Adds a node to the spine section in content.opf file """
 152
 153     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 154     spine.append(e)
 155
 156
 157 class TOC(object):
 158     def __init__(self, name=None, part_href=None):
 159         self.children = []
 160         self.name = name
 161         self.part_href = part_href
 162         self.sub_number = None
 163
 164     def add(self, name, part_href, level=0, is_part=True, index=None):
 165         assert level == 0 or index is None
 166         if level > 0 and self.children:
 167             return self.children[-1].add(name, part_href, level-1, is_part)
 168         else:
 169             t = TOC(name)
 170             t.part_href = part_href
 171             if index is not None:
 172                 self.children.insert(index, t)
 173             else:
 174                 self.children.append(t)
 175             if not is_part:
 176                 t.sub_number = len(self.children) + 1
 177                 return t.sub_number
 178
 179     def append(self, toc):
 180         self.children.append(toc)
 181
 182     def extend(self, toc):
 183         self.children.extend(toc.children)
 184
 185     def depth(self):
 186         if self.children:
 187             return max((c.depth() for c in self.children)) + 1
 188         else:
 189             return 0
 190
 191     def href(self):
 192         src = self.part_href
 193         if self.sub_number is not None:
 194             src += '#sub%d' % self.sub_number
 195         return src
 196
 197     def write_to_xml(self, nav_map, counter=1):
 198         for child in self.children:
 199             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 200             nav_point.set('id', 'NavPoint-%d' % counter)
 201             nav_point.set('playOrder', str(counter))
 202
 203             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 204             text = nav_map.makeelement(NCXNS('text'))
 205             text.text = child.name
 206             nav_label.append(text)
 207             nav_point.append(nav_label)
 208
 209             content = nav_map.makeelement(NCXNS('content'))
 210             content.set('src', child.href())
 211             nav_point.append(content)
 212             nav_map.append(nav_point)
 213             counter = child.write_to_xml(nav_point, counter + 1)
 214         return counter
 215
 216     def html_part(self, depth=0):
 217         texts = []
 218         for child in self.children:
 219             texts.append(
 220                 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
 221                 (depth, child.href(), child.name))
 222             texts.append(child.html_part(depth+1))
 223         return "\n".join(texts)
 224
 225     def html(self):
 226         with open(get_resource('epub/toc.html')) as f:
 227             t = unicode(f.read(), 'utf-8')
 228         return t % self.html_part()
 229
 230
 231 def used_chars(element):
 232     """ Lists characters used in an ETree Element """
 233     chars = set((element.text or '') + (element.tail or ''))
 234     for child in element:
 235         chars = chars.union(used_chars(child))
 236     return chars
 237
 238
 239 def chop(main_text):
 240     """ divide main content of the XML file into chunks """
 241
 242     # prepare a container for each chunk
 243     part_xml = etree.Element('utwor')
 244     etree.SubElement(part_xml, 'master')
 245     main_xml_part = part_xml[0] # master
 246
 247     last_node_part = False
 248     for one_part in main_text:
 249         name = one_part.tag
 250         if name == 'naglowek_czesc':
 251             yield part_xml
 252             last_node_part = True
 253             main_xml_part[:] = [deepcopy(one_part)]
 254         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 255             yield part_xml
 256             main_xml_part[:] = [deepcopy(one_part)]
 257         else:
 258             main_xml_part.append(deepcopy(one_part))
 259             last_node_part = False
 260     yield part_xml
 261
 262
 263 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 264     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 265
 266     toc = TOC()
 267     for element in chunk_xml[0]:
 268         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 269             toc.add(node_name(element), "part%d.html" % chunk_no)
 270         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 271             subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
 272             element.set('sub', str(subnumber))
 273     if empty:
 274         if not _empty_html_static:
 275             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 276         chars = set()
 277         output_html = _empty_html_static[0]
 278     else:
 279         find_annotations(annotations, chunk_xml, chunk_no)
 280         replace_by_verse(chunk_xml)
 281         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 282         chars = used_chars(html_tree.getroot())
 283         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 284     return output_html, toc, chars
 285
 286
 287 def transform(wldoc, verbose=False,
 288               style=None, html_toc=False,
 289               sample=None, cover=None, flags=None):
 290     """ produces a EPUB file
 291
 292     sample=n: generate sample e-book (with at least n paragraphs)
 293     cover: a cover.Cover object
 294     flags: less-advertising, without-fonts
 295     """
 296
 297     def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
 298         """ processes one input file and proceeds to its children """
 299
 300         replace_characters(wldoc.edoc.getroot())
 301
 302         # every input file will have a TOC entry,
 303         # pointing to starting chunk
 304         toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
 305         chars = set()
 306         if first:
 307             # write book title page
 308             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
 309             chars = used_chars(html_tree.getroot())
 310             zip.writestr('OPS/title.html',
 311                  etree.tostring(html_tree, method="html", pretty_print=True))
 312             # add a title page TOC entry
 313             toc.add(u"Strona tytułowa", "title.html")
 314         elif wldoc.book_info.parts:
 315             # write title page for every parent
 316             if sample is not None and sample <= 0:
 317                 chars = set()
 318                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 319             else:
 320                 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
 321                 chars = used_chars(html_tree.getroot())
 322                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 323             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 324             add_to_manifest(manifest, chunk_counter)
 325             add_to_spine(spine, chunk_counter)
 326             chunk_counter += 1
 327
 328         if len(wldoc.edoc.getroot()) > 1:
 329             # rdf before style master
 330             main_text = wldoc.edoc.getroot()[1]
 331         else:
 332             # rdf in style master
 333             main_text = wldoc.edoc.getroot()[0]
 334             if main_text.tag == RDFNS('RDF'):
 335                 main_text = None
 336
 337         if main_text is not None:
 338             for chunk_xml in chop(main_text):
 339                 empty = False
 340                 if sample is not None:
 341                     if sample <= 0:
 342                         empty = True
 343                     else:
 344                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 345                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 346
 347                 toc.extend(chunk_toc)
 348                 chars = chars.union(chunk_chars)
 349                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 350                 add_to_manifest(manifest, chunk_counter)
 351                 add_to_spine(spine, chunk_counter)
 352                 chunk_counter += 1
 353
 354         for child in wldoc.parts():
 355             child_toc, chunk_counter, chunk_chars, sample = transform_file(
 356                 child, chunk_counter, first=False, sample=sample)
 357             toc.append(child_toc)
 358             chars = chars.union(chunk_chars)
 359
 360         return toc, chunk_counter, chars, sample
 361
 362
 363     document = deepcopy(wldoc)
 364     del wldoc
 365
 366     if flags:
 367         for flag in flags:
 368             document.edoc.getroot().set(flag, 'yes')
 369
 370     opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
 371     manifest = opf.find('.//' + OPFNS('manifest'))
 372     guide = opf.find('.//' + OPFNS('guide'))
 373     spine = opf.find('.//' + OPFNS('spine'))
 374
 375     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
 376     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 377
 378     # write static elements
 379     mime = zipfile.ZipInfo()
 380     mime.filename = 'mimetype'
 381     mime.compress_type = zipfile.ZIP_STORED
 382     mime.extra = ''
 383     zip.writestr(mime, 'application/epub+zip')
 384     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 385                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 386                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 387                        'media-type="application/oebps-package+xml" />' \
 388                        '</rootfiles></container>')
 389     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 390     zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
 391     if not style:
 392         style = get_resource('epub/style.css')
 393     zip.write(style, os.path.join('OPS', 'style.css'))
 394
 395
 396     if cover:
 397         cover_file = StringIO()
 398         c = cover(document.book_info.author.readable(), document.book_info.title)
 399         c.save(cover_file)
 400         c_name = 'cover.%s' % c.ext()
 401         zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
 402         del cover_file
 403
 404         cover_tree = etree.parse(get_resource('epub/cover.html'))
 405         cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
 406         zip.writestr('OPS/cover.html', etree.tostring(
 407                         cover_tree, method="html", pretty_print=True))
 408
 409         manifest.append(etree.fromstring(
 410             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 411         manifest.append(etree.fromstring(
 412             '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
 413         spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
 414         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 415         guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
 416
 417
 418     annotations = etree.Element('annotations')
 419
 420     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 421                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 422                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 423                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 424                                '</navMap></ncx>')
 425     nav_map = toc_file[-1]
 426
 427     if html_toc:
 428         manifest.append(etree.fromstring(
 429             '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
 430         spine.append(etree.fromstring(
 431             '<itemref idref="html_toc" />'))
 432         guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
 433
 434     toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
 435
 436     if len(toc.children) < 2:
 437         toc.add(u"Początek utworu", "part1.html")
 438
 439     # Last modifications in container files and EPUB creation
 440     if len(annotations) > 0:
 441         toc.add("Przypisy", "annotations.html")
 442         manifest.append(etree.fromstring(
 443             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 444         spine.append(etree.fromstring(
 445             '<itemref idref="annotations" />'))
 446         replace_by_verse(annotations)
 447         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 448         chars = chars.union(used_chars(html_tree.getroot()))
 449         zip.writestr('OPS/annotations.html', etree.tostring(
 450                             html_tree, method="html", pretty_print=True))
 451
 452     toc.add("Strona redakcyjna", "last.html")
 453     manifest.append(etree.fromstring(
 454         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 455     spine.append(etree.fromstring(
 456         '<itemref idref="last" />'))
 457     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
 458     chars.update(used_chars(html_tree.getroot()))
 459     zip.writestr('OPS/last.html', etree.tostring(
 460                         html_tree, method="html", pretty_print=True))
 461
 462     if not flags or not 'without-fonts' in flags:
 463         # strip fonts
 464         tmpdir = mkdtemp('-librarian-epub')
 465         cwd = os.getcwd()
 466
 467         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 468         for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 469             optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 470                               get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 471             if verbose:
 472                 print "Running font-optimizer"
 473                 subprocess.check_call(optimizer_call)
 474             else:
 475                 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 476             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 477             manifest.append(etree.fromstring(
 478                 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
 479         rmtree(tmpdir)
 480         os.chdir(cwd)
 481
 482     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 483     title = document.book_info.title
 484     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 485     for st in attributes:
 486         meta = toc_file.makeelement(NCXNS('meta'))
 487         meta.set('name', st)
 488         meta.set('content', '0')
 489         toc_file[0].append(meta)
 490     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 491     toc_file[0][1].set('content', str(toc.depth()))
 492     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 493
 494     # write TOC
 495     if html_toc:
 496         toc.add(u"Spis treści", "toc.html", index=1)
 497         zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
 498     toc.write_to_xml(nav_map)
 499     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 500     zip.close()
 501
 502     return OutputFile.from_filename(output_file.name)