librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from StringIO import StringIO
  12 from copy import deepcopy
  13 from lxml import etree
  14 import zipfile
  15 from tempfile import mkdtemp, NamedTemporaryFile
  16 from shutil import rmtree
  17
  18 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
  19 from librarian.cover import ImageCover as WLCover
  20
  21 from librarian import functions, get_resource
  22
# Import-time side effect: calls reg_person_name() from librarian.functions.
# NOTE(review): presumably this registers a person-name helper that the XSLT
# stylesheets used below rely on -- confirm against librarian.functions.
functions.reg_person_name()
  24
  25
  26 def inner_xml(node):
  27     """ returns node's text and children as a string
  28
  29     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  30     x<b>y</b>z
  31     """
  32
  33     nt = node.text if node.text is not None else ''
  34     return ''.join([nt] + [etree.tostring(child) for child in node])
  35
  36 def set_inner_xml(node, text):
  37     """ sets node's text and children from a string
  38
  39     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  40     >>> set_inner_xml(e, 'x<b>y</b>z')
  41     >>> print etree.tostring(e)
  42     <a>x<b>y</b>z</a>
  43     """
  44
  45     p = etree.fromstring('<x>%s</x>' % text)
  46     node.text = p.text
  47     node[:] = p[:]
  48
  49
  50 def node_name(node):
  51     """ Find out a node's name
  52
  53     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  54     XYZ
  55     """
  56
  57     tempnode = deepcopy(node)
  58
  59     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  60         for e in tempnode.findall('.//%s' % p):
  61             t = e.tail
  62             e.clear()
  63             e.tail = t
  64     etree.strip_tags(tempnode, '*')
  65     return tempnode.text
  66
  67
  68 def xslt(xml, sheet):
  69     if isinstance(xml, etree._Element):
  70         xml = etree.ElementTree(xml)
  71     with open(sheet) as xsltf:
  72         return xml.xslt(etree.parse(xsltf))
  73
  74
  75 def replace_characters(node):
  76     def replace_chars(text):
  77         if text is None:
  78             return None
  79         return text.replace(u"\ufeff", u"")\
  80                    .replace("---", u"\u2014")\
  81                    .replace("--", u"\u2013")\
  82                    .replace(",,", u"“")\
  83                    .replace('"', u"\u201D")\
  84                    .replace("'", u"\u2019")
  85     if node.tag in ('uwaga', 'extra'):
  86         t = node.tail
  87         node.clear()
  88         node.tail = t
  89     node.text = replace_chars(node.text)
  90     node.tail = replace_chars(node.tail)
  91     for child in node:
  92         replace_characters(child)
  93
  94
  95 def find_annotations(annotations, source, part_no):
  96     for child in source:
  97         if child.tag in ('pe', 'pa', 'pt', 'pr'):
  98             annotation = deepcopy(child)
  99             number = str(len(annotations)+1)
 100             annotation.set('number', number)
 101             annotation.set('part', str(part_no))
 102             annotation.tail = ''
 103             annotations.append(annotation)
 104             tail = child.tail
 105             child.clear()
 106             child.tail = tail
 107             child.text = number
 108         if child.tag not in ('extra', 'uwaga'):
 109             find_annotations(annotations, child, part_no)
 110
 111
 112 def replace_by_verse(tree):
 113     """ Find stanzas and create new verses in place of a '/' character """
 114
 115     stanzas = tree.findall('.//' + WLNS('strofa'))
 116     for node in stanzas:
 117         for child_node in node:
 118             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 119                 foreign_verses = inner_xml(child_node).split('/\n')
 120                 if len(foreign_verses) > 1:
 121                     new_foreign = ''
 122                     for foreign_verse in foreign_verses:
 123                         if foreign_verse.startswith('<wers'):
 124                             new_foreign += foreign_verse
 125                         else:
 126                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 127                     set_inner_xml(child_node, new_foreign)
 128         verses = inner_xml(node).split('/\n')
 129         if len(verses) > 1:
 130             modified_inner_xml = ''
 131             for verse in verses:
 132                 if verse.startswith('<wers') or verse.startswith('<extra'):
 133                     modified_inner_xml += verse
 134                 else:
 135                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 136             set_inner_xml(node, modified_inner_xml)
 137
 138
 139 def add_to_manifest(manifest, partno):
 140     """ Adds a node to the manifest section in content.opf file """
 141
 142     partstr = 'part%d' % partno
 143     e = manifest.makeelement(OPFNS('item'), attrib={
 144                                  'id': partstr,
 145                                  'href': partstr + '.html',
 146                                  'media-type': 'application/xhtml+xml',
 147                              })
 148     manifest.append(e)
 149
 150
 151 def add_to_spine(spine, partno):
 152     """ Adds a node to the spine section in content.opf file """
 153
 154     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 155     spine.append(e)
 156
 157
 158 class TOC(object):
 159     def __init__(self, name=None, part_href=None):
 160         self.children = []
 161         self.name = name
 162         self.part_href = part_href
 163         self.sub_number = None
 164
 165     def add(self, name, part_href, level=0, is_part=True, index=None):
 166         assert level == 0 or index is None
 167         if level > 0 and self.children:
 168             return self.children[-1].add(name, part_href, level-1, is_part)
 169         else:
 170             t = TOC(name)
 171             t.part_href = part_href
 172             if index is not None:
 173                 self.children.insert(index, t)
 174             else:
 175                 self.children.append(t)
 176             if not is_part:
 177                 t.sub_number = len(self.children) + 1
 178                 return t.sub_number
 179
 180     def append(self, toc):
 181         self.children.append(toc)
 182
 183     def extend(self, toc):
 184         self.children.extend(toc.children)
 185
 186     def depth(self):
 187         if self.children:
 188             return max((c.depth() for c in self.children)) + 1
 189         else:
 190             return 0
 191
 192     def href(self):
 193         src = self.part_href
 194         if self.sub_number is not None:
 195             src += '#sub%d' % self.sub_number
 196         return src
 197
 198     def write_to_xml(self, nav_map, counter=1):
 199         for child in self.children:
 200             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 201             nav_point.set('id', 'NavPoint-%d' % counter)
 202             nav_point.set('playOrder', str(counter))
 203
 204             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 205             text = nav_map.makeelement(NCXNS('text'))
 206             text.text = child.name
 207             nav_label.append(text)
 208             nav_point.append(nav_label)
 209
 210             content = nav_map.makeelement(NCXNS('content'))
 211             content.set('src', child.href())
 212             nav_point.append(content)
 213             nav_map.append(nav_point)
 214             counter = child.write_to_xml(nav_point, counter + 1)
 215         return counter
 216
 217     def html_part(self, depth=0):
 218         texts = []
 219         for child in self.children:
 220             texts.append(
 221                 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
 222                 (depth, child.href(), child.name))
 223             texts.append(child.html_part(depth+1))
 224         return "\n".join(texts)
 225
 226     def html(self):
 227         with open(get_resource('epub/toc.html')) as f:
 228             t = unicode(f.read(), 'utf-8')
 229         return t % self.html_part()
 230
 231
 232 def used_chars(element):
 233     """ Lists characters used in an ETree Element """
 234     chars = set((element.text or '') + (element.tail or ''))
 235     for child in element:
 236         chars = chars.union(used_chars(child))
 237     return chars
 238
 239
 240 def chop(main_text):
 241     """ divide main content of the XML file into chunks """
 242
 243     # prepare a container for each chunk
 244     part_xml = etree.Element('utwor')
 245     etree.SubElement(part_xml, 'master')
 246     main_xml_part = part_xml[0] # master
 247
 248     last_node_part = False
 249     for one_part in main_text:
 250         name = one_part.tag
 251         #if name == 'naglowek_czesc':
 252         #    yield part_xml
 253         #    last_node_part = True
 254         #    main_xml_part[:] = [deepcopy(one_part)]
 255         #elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 256         #    yield part_xml
 257         #    main_xml_part[:] = [deepcopy(one_part)]
 258         #else:
 259         if True:
 260             main_xml_part.append(deepcopy(one_part))
 261             last_node_part = False
 262     yield part_xml
 263
 264
 265 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 266     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 267
 268     toc = TOC()
 269     #for element in chunk_xml[0]:
 270     #    if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 271     #        toc.add(node_name(element), "part%d.html" % chunk_no)
 272     #    elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 273     #        subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
 274     #        element.set('sub', str(subnumber))
 275     if empty:
 276         if not _empty_html_static:
 277             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 278         chars = set()
 279         output_html = _empty_html_static[0]
 280     else:
 281         if chunk_no == 1:
 282             html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme-FoC.xsl'))
 283         else:
 284             find_annotations(annotations, chunk_xml, chunk_no)
 285             replace_by_verse(chunk_xml)
 286             html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 287         chars = used_chars(html_tree.getroot())
 288         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 289     return output_html, toc, chars
 290
 291
 292 def transform(wldoc, verbose=False,
 293               style=None, html_toc=False,
 294               sample=None, cover=None, flags=None):
 295     """ produces a EPUB file
 296
 297     sample=n: generate sample e-book (with at least n paragraphs)
 298     cover: a cover.Cover object or True for default
 299     flags: less-advertising, without-fonts, working-copy
 300     """
 301
 302     def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
 303         """ processes one input file and proceeds to its children """
 304
 305         replace_characters(wldoc.edoc.getroot())
 306
 307         # every input file will have a TOC entry,
 308         # pointing to starting chunk
 309
 310         # hack for FoC:
 311         if wldoc.book_info.author is not None:
 312             toc_title = "%s, %s" % (wldoc.book_info.author.readable(), wldoc.book_info.title)
 313             note = wldoc.edoc.find('//dzielo_nadrzedne')
 314             if note is not None:
 315                 toc_title += " (%s)" % note.text
 316         else:
 317             toc_title = wldoc.book_info.title
 318         toc = TOC(toc_title, "part%d.html" % chunk_counter)
 319         chars = set()
 320         if first:
 321             # write book title page
 322             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
 323             chars = used_chars(html_tree.getroot())
 324             zip.writestr('OPS/title.html',
 325                  etree.tostring(html_tree, method="html", pretty_print=True))
 326             # add a title page TOC entry
 327             toc.add(u"Title page", "title.html")
 328             toc.add(u"Dear readers!", "part1.html")
 329         elif wldoc.book_info.parts:
 330             # write title page for every parent
 331             if sample is not None and sample <= 0:
 332                 chars = set()
 333                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 334             else:
 335                 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
 336                 chars = used_chars(html_tree.getroot())
 337                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 338             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 339             add_to_manifest(manifest, chunk_counter)
 340             add_to_spine(spine, chunk_counter)
 341             chunk_counter += 1
 342
 343         if len(wldoc.edoc.getroot()) > 1:
 344             # rdf before style master
 345             main_text = wldoc.edoc.getroot()[1]
 346         else:
 347             # rdf in style master
 348             main_text = wldoc.edoc.getroot()[0]
 349             if main_text.tag == RDFNS('RDF'):
 350                 main_text = None
 351
 352         if main_text is not None:
 353             for chunk_xml in chop(main_text):
 354                 empty = False
 355                 if sample is not None:
 356                     if sample <= 0:
 357                         empty = True
 358                     else:
 359                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 360                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 361
 362                 toc.extend(chunk_toc)
 363                 chars = chars.union(chunk_chars)
 364                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 365                 add_to_manifest(manifest, chunk_counter)
 366                 add_to_spine(spine, chunk_counter)
 367                 chunk_counter += 1
 368
 369         for child in wldoc.parts():
 370             child_toc, chunk_counter, chunk_chars, sample = transform_file(
 371                 child, chunk_counter, first=False, sample=sample)
 372             toc.append(child_toc)
 373             chars = chars.union(chunk_chars)
 374
 375         return toc, chunk_counter, chars, sample
 376
 377
 378     document = deepcopy(wldoc)
 379     del wldoc
 380
 381     if flags:
 382         for flag in flags:
 383             document.edoc.getroot().set(flag, 'yes')
 384
 385     opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
 386     manifest = opf.find('.//' + OPFNS('manifest'))
 387     guide = opf.find('.//' + OPFNS('guide'))
 388     spine = opf.find('.//' + OPFNS('spine'))
 389
 390     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
 391     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 392
 393     # write static elements
 394     mime = zipfile.ZipInfo()
 395     mime.filename = 'mimetype'
 396     mime.compress_type = zipfile.ZIP_STORED
 397     mime.extra = ''
 398     zip.writestr(mime, 'application/epub+zip')
 399     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 400                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 401                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 402                        'media-type="application/oebps-package+xml" />' \
 403                        '</rootfiles></container>')
 404     #zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 405     #zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
 406     zip.write('logo.png', os.path.join('OPS', 'logo.png'))
 407     if not style:
 408         style = get_resource('epub/style.css')
 409     zip.write(style, os.path.join('OPS', 'style.css'))
 410
 411     if cover:
 412         if cover is True:
 413             cover = WLCover
 414         if cover.uses_dc_cover:
 415             if document.book_info.cover_by:
 416                 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
 417             if document.book_info.cover_source:
 418                 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
 419
 420         cover_file = StringIO()
 421         c = cover(document.book_info)
 422         import Image
 423         c.im = Image.open('cover.jpg')
 424         c.ext = lambda: 'jpg'
 425         c.save(cover_file)
 426         c_name = 'cover.%s' % c.ext()
 427         zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
 428         del cover_file
 429
 430         cover_tree = etree.parse(get_resource('epub/cover.html'))
 431         cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
 432         zip.writestr('OPS/cover.html', etree.tostring(
 433                         cover_tree, method="html", pretty_print=True))
 434
 435         manifest.append(etree.fromstring(
 436             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 437         manifest.append(etree.fromstring(
 438             '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
 439         spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
 440         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 441         guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
 442
 443
 444     annotations = etree.Element('annotations')
 445
 446     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 447                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 448                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 449                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 450                                '</navMap></ncx>')
 451     nav_map = toc_file[-1]
 452
 453     if html_toc:
 454         manifest.append(etree.fromstring(
 455             '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
 456         spine.append(etree.fromstring(
 457             '<itemref idref="html_toc" />'))
 458         guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Table of Contents"/>'))
 459
 460     toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
 461
 462     if len(toc.children) < 2:
 463         toc.add(u"Początek utworu", "part1.html")
 464
 465     # Last modifications in container files and EPUB creation
 466     if len(annotations) > 0:
 467         toc.add("Przypisy", "annotations.html")
 468         manifest.append(etree.fromstring(
 469             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 470         spine.append(etree.fromstring(
 471             '<itemref idref="annotations" />'))
 472         replace_by_verse(annotations)
 473         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 474         chars = chars.union(used_chars(html_tree.getroot()))
 475         zip.writestr('OPS/annotations.html', etree.tostring(
 476                             html_tree, method="html", pretty_print=True))
 477
 478     toc.add("Editorial page", "last.html")
 479     manifest.append(etree.fromstring(
 480         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 481     spine.append(etree.fromstring(
 482         '<itemref idref="last" />'))
 483     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
 484     chars.update(used_chars(html_tree.getroot()))
 485     zip.writestr('OPS/last.html', etree.tostring(
 486                         html_tree, method="html", pretty_print=True))
 487
 488     if not flags or not 'without-fonts' in flags:
 489         # strip fonts
 490         tmpdir = mkdtemp('-librarian-epub')
 491         cwd = os.getcwd()
 492
 493         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 494         for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 495             optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 496                               get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 497             if verbose:
 498                 print "Running font-optimizer"
 499                 subprocess.check_call(optimizer_call)
 500             else:
 501                 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 502             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 503             manifest.append(etree.fromstring(
 504                 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
 505         rmtree(tmpdir)
 506         os.chdir(cwd)
 507
 508     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 509     title = document.book_info.title
 510     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 511     for st in attributes:
 512         meta = toc_file.makeelement(NCXNS('meta'))
 513         meta.set('name', st)
 514         meta.set('content', '0')
 515         toc_file[0].append(meta)
 516     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 517     toc_file[0][1].set('content', str(toc.depth()))
 518     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 519
 520     # write TOC
 521     if html_toc:
 522         toc.add(u"Table of Contents", "toc.html", index=1)
 523         zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
 524     toc.write_to_xml(nav_map)
 525     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 526     zip.close()
 527
 528     return OutputFile.from_filename(output_file.name)