librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 import os
   9 import os.path
  10 import subprocess
  11 from StringIO import StringIO
  12 from copy import deepcopy
  13 from lxml import etree
  14 import zipfile
  15 from tempfile import mkdtemp, NamedTemporaryFile
  16 from shutil import rmtree
  17
  18 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
  19 from librarian.cover import WLCover
  20
  21 from librarian import functions, get_resource
  22
# Import-time side effect: runs librarian.functions.reg_person_name() once,
# when this module is first imported.
# NOTE(review): presumably this registers an extension function used by the
# XSLT stylesheets applied below -- confirm in librarian.functions.
functions.reg_person_name()
  24
  25
  26 def inner_xml(node):
  27     """ returns node's text and children as a string
  28
  29     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  30     x<b>y</b>z
  31     """
  32
  33     nt = node.text if node.text is not None else ''
  34     return ''.join([nt] + [etree.tostring(child) for child in node])
  35
  36 def set_inner_xml(node, text):
  37     """ sets node's text and children from a string
  38
  39     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  40     >>> set_inner_xml(e, 'x<b>y</b>z')
  41     >>> print etree.tostring(e)
  42     <a>x<b>y</b>z</a>
  43     """
  44
  45     p = etree.fromstring('<x>%s</x>' % text)
  46     node.text = p.text
  47     node[:] = p[:]
  48
  49
  50 def node_name(node):
  51     """ Find out a node's name
  52
  53     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  54     XYZ
  55     """
  56
  57     tempnode = deepcopy(node)
  58
  59     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  60         for e in tempnode.findall('.//%s' % p):
  61             t = e.tail
  62             e.clear()
  63             e.tail = t
  64     etree.strip_tags(tempnode, '*')
  65     return tempnode.text
  66
  67
  68 def xslt(xml, sheet):
  69     if isinstance(xml, etree._Element):
  70         xml = etree.ElementTree(xml)
  71     with open(sheet) as xsltf:
  72         return xml.xslt(etree.parse(xsltf))
  73
  74
  75 def replace_characters(node):
  76     def replace_chars(text):
  77         if text is None:
  78             return None
  79         return text.replace(u"\ufeff", u"")\
  80                    .replace("---", u"\u2014")\
  81                    .replace("--", u"\u2013")\
  82                    .replace(",,", u"\u201E")\
  83                    .replace('"', u"\u201D")\
  84                    .replace("'", u"\u2019")
  85     if node.tag in ('uwaga', 'extra'):
  86         t = node.tail
  87         node.clear()
  88         node.tail = t
  89     node.text = replace_chars(node.text)
  90     node.tail = replace_chars(node.tail)
  91     for child in node:
  92         replace_characters(child)
  93
  94
  95 def find_annotations(annotations, source, part_no):
  96     for child in source:
  97         if child.tag in ('pe', 'pa', 'pt', 'pr'):
  98             annotation = deepcopy(child)
  99             number = str(len(annotations)+1)
 100             annotation.set('number', number)
 101             annotation.set('part', str(part_no))
 102             annotation.tail = ''
 103             annotations.append(annotation)
 104             tail = child.tail
 105             child.clear()
 106             child.tail = tail
 107             child.text = number
 108         if child.tag not in ('extra', 'uwaga'):
 109             find_annotations(annotations, child, part_no)
 110
 111
 112 def replace_by_verse(tree):
 113     """ Find stanzas and create new verses in place of a '/' character """
 114
 115     stanzas = tree.findall('.//' + WLNS('strofa'))
 116     for node in stanzas:
 117         for child_node in node:
 118             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 119                 foreign_verses = inner_xml(child_node).split('/\n')
 120                 if len(foreign_verses) > 1:
 121                     new_foreign = ''
 122                     for foreign_verse in foreign_verses:
 123                         if foreign_verse.startswith('<wers'):
 124                             new_foreign += foreign_verse
 125                         else:
 126                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 127                     set_inner_xml(child_node, new_foreign)
 128         verses = inner_xml(node).split('/\n')
 129         if len(verses) > 1:
 130             modified_inner_xml = ''
 131             for verse in verses:
 132                 if verse.startswith('<wers') or verse.startswith('<extra'):
 133                     modified_inner_xml += verse
 134                 else:
 135                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 136             set_inner_xml(node, modified_inner_xml)
 137
 138
 139 def add_to_manifest(manifest, partno):
 140     """ Adds a node to the manifest section in content.opf file """
 141
 142     partstr = 'part%d' % partno
 143     e = manifest.makeelement(OPFNS('item'), attrib={
 144                                  'id': partstr,
 145                                  'href': partstr + '.html',
 146                                  'media-type': 'application/xhtml+xml',
 147                              })
 148     manifest.append(e)
 149
 150
 151 def add_to_spine(spine, partno):
 152     """ Adds a node to the spine section in content.opf file """
 153
 154     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 155     spine.append(e)
 156
 157
 158 class TOC(object):
 159     def __init__(self, name=None, part_href=None):
 160         self.children = []
 161         self.name = name
 162         self.part_href = part_href
 163         self.sub_number = None
 164
 165     def add(self, name, part_href, level=0, is_part=True, index=None):
 166         assert level == 0 or index is None
 167         if level > 0 and self.children:
 168             return self.children[-1].add(name, part_href, level-1, is_part)
 169         else:
 170             t = TOC(name)
 171             t.part_href = part_href
 172             if index is not None:
 173                 self.children.insert(index, t)
 174             else:
 175                 self.children.append(t)
 176             if not is_part:
 177                 t.sub_number = len(self.children) + 1
 178                 return t.sub_number
 179
 180     def append(self, toc):
 181         self.children.append(toc)
 182
 183     def extend(self, toc):
 184         self.children.extend(toc.children)
 185
 186     def depth(self):
 187         if self.children:
 188             return max((c.depth() for c in self.children)) + 1
 189         else:
 190             return 0
 191
 192     def href(self):
 193         src = self.part_href
 194         if self.sub_number is not None:
 195             src += '#sub%d' % self.sub_number
 196         return src
 197
 198     def write_to_xml(self, nav_map, counter=1):
 199         for child in self.children:
 200             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 201             nav_point.set('id', 'NavPoint-%d' % counter)
 202             nav_point.set('playOrder', str(counter))
 203
 204             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 205             text = nav_map.makeelement(NCXNS('text'))
 206             text.text = child.name
 207             nav_label.append(text)
 208             nav_point.append(nav_label)
 209
 210             content = nav_map.makeelement(NCXNS('content'))
 211             content.set('src', child.href())
 212             nav_point.append(content)
 213             nav_map.append(nav_point)
 214             counter = child.write_to_xml(nav_point, counter + 1)
 215         return counter
 216
 217     def html_part(self, depth=0):
 218         texts = []
 219         for child in self.children:
 220             texts.append(
 221                 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
 222                 (depth, child.href(), child.name))
 223             texts.append(child.html_part(depth+1))
 224         return "\n".join(texts)
 225
 226     def html(self):
 227         with open(get_resource('epub/toc.html')) as f:
 228             t = unicode(f.read(), 'utf-8')
 229         return t % self.html_part()
 230
 231
 232 def used_chars(element):
 233     """ Lists characters used in an ETree Element """
 234     chars = set((element.text or '') + (element.tail or ''))
 235     for child in element:
 236         chars = chars.union(used_chars(child))
 237     return chars
 238
 239
 240 def chop(main_text):
 241     """ divide main content of the XML file into chunks """
 242
 243     # prepare a container for each chunk
 244     part_xml = etree.Element('utwor')
 245     etree.SubElement(part_xml, 'master')
 246     main_xml_part = part_xml[0] # master
 247
 248     last_node_part = False
 249     for one_part in main_text:
 250         name = one_part.tag
 251         if name == 'naglowek_czesc':
 252             yield part_xml
 253             last_node_part = True
 254             main_xml_part[:] = [deepcopy(one_part)]
 255         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 256             yield part_xml
 257             main_xml_part[:] = [deepcopy(one_part)]
 258         else:
 259             main_xml_part.append(deepcopy(one_part))
 260             last_node_part = False
 261     yield part_xml
 262
 263
 264 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 265     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 266
 267     toc = TOC()
 268     for element in chunk_xml[0]:
 269         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 270             toc.add(node_name(element), "part%d.html" % chunk_no)
 271         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 272             subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
 273             element.set('sub', str(subnumber))
 274     if empty:
 275         if not _empty_html_static:
 276             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 277         chars = set()
 278         output_html = _empty_html_static[0]
 279     else:
 280         find_annotations(annotations, chunk_xml, chunk_no)
 281         replace_by_verse(chunk_xml)
 282         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 283         chars = used_chars(html_tree.getroot())
 284         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 285     return output_html, toc, chars
 286
 287
 288 def transform(wldoc, verbose=False,
 289               style=None, html_toc=False,
 290               sample=None, cover=None, flags=None):
 291     """ produces a EPUB file
 292
 293     sample=n: generate sample e-book (with at least n paragraphs)
 294     cover: a cover.Cover factory or True for default
 295     flags: less-advertising, without-fonts, working-copy
 296     """
 297
 298     def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
 299         """ processes one input file and proceeds to its children """
 300
 301         replace_characters(wldoc.edoc.getroot())
 302
 303         # every input file will have a TOC entry,
 304         # pointing to starting chunk
 305         toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
 306         chars = set()
 307         if first:
 308             # write book title page
 309             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
 310             chars = used_chars(html_tree.getroot())
 311             zip.writestr('OPS/title.html',
 312                  etree.tostring(html_tree, method="html", pretty_print=True))
 313             # add a title page TOC entry
 314             toc.add(u"Strona tytułowa", "title.html")
 315         elif wldoc.book_info.parts:
 316             # write title page for every parent
 317             if sample is not None and sample <= 0:
 318                 chars = set()
 319                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 320             else:
 321                 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
 322                 chars = used_chars(html_tree.getroot())
 323                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 324             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 325             add_to_manifest(manifest, chunk_counter)
 326             add_to_spine(spine, chunk_counter)
 327             chunk_counter += 1
 328
 329         if len(wldoc.edoc.getroot()) > 1:
 330             # rdf before style master
 331             main_text = wldoc.edoc.getroot()[1]
 332         else:
 333             # rdf in style master
 334             main_text = wldoc.edoc.getroot()[0]
 335             if main_text.tag == RDFNS('RDF'):
 336                 main_text = None
 337
 338         if main_text is not None:
 339             for chunk_xml in chop(main_text):
 340                 empty = False
 341                 if sample is not None:
 342                     if sample <= 0:
 343                         empty = True
 344                     else:
 345                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 346                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 347
 348                 toc.extend(chunk_toc)
 349                 chars = chars.union(chunk_chars)
 350                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 351                 add_to_manifest(manifest, chunk_counter)
 352                 add_to_spine(spine, chunk_counter)
 353                 chunk_counter += 1
 354
 355         for child in wldoc.parts():
 356             child_toc, chunk_counter, chunk_chars, sample = transform_file(
 357                 child, chunk_counter, first=False, sample=sample)
 358             toc.append(child_toc)
 359             chars = chars.union(chunk_chars)
 360
 361         return toc, chunk_counter, chars, sample
 362
 363
 364     document = deepcopy(wldoc)
 365     del wldoc
 366
 367     if flags:
 368         for flag in flags:
 369             document.edoc.getroot().set(flag, 'yes')
 370
 371     # add editors info
 372     document.edoc.getroot().set('editors', u', '.join(sorted(
 373         editor.readable() for editor in document.editors())))
 374
 375     opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
 376     manifest = opf.find('.//' + OPFNS('manifest'))
 377     guide = opf.find('.//' + OPFNS('guide'))
 378     spine = opf.find('.//' + OPFNS('spine'))
 379
 380     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
 381     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 382
 383     # write static elements
 384     mime = zipfile.ZipInfo()
 385     mime.filename = 'mimetype'
 386     mime.compress_type = zipfile.ZIP_STORED
 387     mime.extra = ''
 388     zip.writestr(mime, 'application/epub+zip')
 389     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 390                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 391                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 392                        'media-type="application/oebps-package+xml" />' \
 393                        '</rootfiles></container>')
 394     zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 395     zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
 396     if not style:
 397         style = get_resource('epub/style.css')
 398     zip.write(style, os.path.join('OPS', 'style.css'))
 399
 400     if cover:
 401         if cover is True:
 402             cover = WLCover
 403
 404         cover_file = StringIO()
 405         bound_cover = cover(document.book_info)
 406         bound_cover.save(cover_file)
 407         cover_name = 'cover.%s' % bound_cover.ext()
 408         zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
 409         del cover_file
 410
 411         cover_tree = etree.parse(get_resource('epub/cover.html'))
 412         cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
 413         zip.writestr('OPS/cover.html', etree.tostring(
 414                         cover_tree, method="html", pretty_print=True))
 415
 416         if bound_cover.uses_dc_cover:
 417             if document.book_info.cover_by:
 418                 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
 419             if document.book_info.cover_source:
 420                 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
 421
 422         manifest.append(etree.fromstring(
 423             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 424         manifest.append(etree.fromstring(
 425             '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
 426         spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
 427         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 428         guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
 429
 430
 431     annotations = etree.Element('annotations')
 432
 433     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 434                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 435                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 436                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 437                                '</navMap></ncx>')
 438     nav_map = toc_file[-1]
 439
 440     if html_toc:
 441         manifest.append(etree.fromstring(
 442             '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
 443         spine.append(etree.fromstring(
 444             '<itemref idref="html_toc" />'))
 445         guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
 446
 447     toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
 448
 449     if len(toc.children) < 2:
 450         toc.add(u"Początek utworu", "part1.html")
 451
 452     # Last modifications in container files and EPUB creation
 453     if len(annotations) > 0:
 454         toc.add("Przypisy", "annotations.html")
 455         manifest.append(etree.fromstring(
 456             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 457         spine.append(etree.fromstring(
 458             '<itemref idref="annotations" />'))
 459         replace_by_verse(annotations)
 460         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 461         chars = chars.union(used_chars(html_tree.getroot()))
 462         zip.writestr('OPS/annotations.html', etree.tostring(
 463                             html_tree, method="html", pretty_print=True))
 464
 465     toc.add("Strona redakcyjna", "last.html")
 466     manifest.append(etree.fromstring(
 467         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 468     spine.append(etree.fromstring(
 469         '<itemref idref="last" />'))
 470     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
 471     chars.update(used_chars(html_tree.getroot()))
 472     zip.writestr('OPS/last.html', etree.tostring(
 473                         html_tree, method="html", pretty_print=True))
 474
 475     if not flags or not 'without-fonts' in flags:
 476         # strip fonts
 477         tmpdir = mkdtemp('-librarian-epub')
 478         try:
 479             cwd = os.getcwd()
 480         except OSError:
 481             cwd = None
 482
 483         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 484         for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 485             optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 486                               get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 487             if verbose:
 488                 print "Running font-optimizer"
 489                 subprocess.check_call(optimizer_call)
 490             else:
 491                 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 492             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 493             manifest.append(etree.fromstring(
 494                 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
 495         rmtree(tmpdir)
 496         if cwd is not None:
 497             os.chdir(cwd)
 498
 499     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 500     title = document.book_info.title
 501     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 502     for st in attributes:
 503         meta = toc_file.makeelement(NCXNS('meta'))
 504         meta.set('name', st)
 505         meta.set('content', '0')
 506         toc_file[0].append(meta)
 507     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 508     toc_file[0][1].set('content', str(toc.depth()))
 509     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 510
 511     # write TOC
 512     if html_toc:
 513         toc.add(u"Spis treści", "toc.html", index=1)
 514         zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
 515     toc.write_to_xml(nav_map)
 516     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 517     zip.close()
 518
 519     return OutputFile.from_filename(output_file.name)