librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7
   8 from copy import deepcopy
   9 import os
  10 import os.path
  11 import subprocess
  12 from StringIO import StringIO
  13 from copy import deepcopy
  14 from lxml import etree
  15 import zipfile
  16 from tempfile import mkdtemp, NamedTemporaryFile
  17 from shutil import rmtree
  18
  19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
  20 from librarian.cover import WLCover
  21
  22 from librarian import functions, get_resource
  23
  24 functions.reg_person_name()
  25
  26
  27 def inner_xml(node):
  28     """ returns node's text and children as a string
  29
  30     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
  31     x<b>y</b>z
  32     """
  33
  34     nt = node.text if node.text is not None else ''
  35     return ''.join([nt] + [etree.tostring(child) for child in node])
  36
  37 def set_inner_xml(node, text):
  38     """ sets node's text and children from a string
  39
  40     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  41     >>> set_inner_xml(e, 'x<b>y</b>z')
  42     >>> print etree.tostring(e)
  43     <a>x<b>y</b>z</a>
  44     """
  45
  46     p = etree.fromstring('<x>%s</x>' % text)
  47     node.text = p.text
  48     node[:] = p[:]
  49
  50
  51 def node_name(node):
  52     """ Find out a node's name
  53
  54     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
  55     XYZ
  56     """
  57
  58     tempnode = deepcopy(node)
  59
  60     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
  61         for e in tempnode.findall('.//%s' % p):
  62             t = e.tail
  63             e.clear()
  64             e.tail = t
  65     etree.strip_tags(tempnode, '*')
  66     return tempnode.text
  67
  68
  69 def xslt(xml, sheet):
  70     if isinstance(xml, etree._Element):
  71         xml = etree.ElementTree(xml)
  72     with open(sheet) as xsltf:
  73         return xml.xslt(etree.parse(xsltf))
  74
  75
  76 def replace_characters(node):
  77     def replace_chars(text):
  78         if text is None:
  79             return None
  80         return text.replace(u"\ufeff", u"")\
  81                    .replace("---", u"\u2014")\
  82                    .replace("--", u"\u2013")\
  83                    .replace(",,", u"\u201E")\
  84                    .replace('"', u"\u201D")\
  85                    .replace("'", u"\u2019")
  86     if node.tag in ('uwaga', 'extra'):
  87         t = node.tail
  88         node.clear()
  89         node.tail = t
  90     node.text = replace_chars(node.text)
  91     node.tail = replace_chars(node.tail)
  92     for child in node:
  93         replace_characters(child)
  94
  95
  96 def find_annotations(annotations, source, part_no):
  97     for child in source:
  98         if child.tag in ('pe', 'pa', 'pt', 'pr'):
  99             annotation = deepcopy(child)
 100             number = str(len(annotations)+1)
 101             annotation.set('number', number)
 102             annotation.set('part', str(part_no))
 103             annotation.tail = ''
 104             annotations.append(annotation)
 105             tail = child.tail
 106             child.clear()
 107             child.tail = tail
 108             child.text = number
 109         if child.tag not in ('extra', 'uwaga'):
 110             find_annotations(annotations, child, part_no)
 111
 112
 113 def replace_by_verse(tree):
 114     """ Find stanzas and create new verses in place of a '/' character """
 115
 116     stanzas = tree.findall('.//' + WLNS('strofa'))
 117     for node in stanzas:
 118         for child_node in node:
 119             if child_node.tag in ('slowo_obce', 'wyroznienie'):
 120                 foreign_verses = inner_xml(child_node).split('/\n')
 121                 if len(foreign_verses) > 1:
 122                     new_foreign = ''
 123                     for foreign_verse in foreign_verses:
 124                         if foreign_verse.startswith('<wers'):
 125                             new_foreign += foreign_verse
 126                         else:
 127                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
 128                     set_inner_xml(child_node, new_foreign)
 129         verses = inner_xml(node).split('/\n')
 130         if len(verses) > 1:
 131             modified_inner_xml = ''
 132             for verse in verses:
 133                 if verse.startswith('<wers') or verse.startswith('<extra'):
 134                     modified_inner_xml += verse
 135                 else:
 136                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
 137             set_inner_xml(node, modified_inner_xml)
 138
 139
 140 def add_to_manifest(manifest, partno):
 141     """ Adds a node to the manifest section in content.opf file """
 142
 143     partstr = 'part%d' % partno
 144     e = manifest.makeelement(OPFNS('item'), attrib={
 145                                  'id': partstr,
 146                                  'href': partstr + '.html',
 147                                  'media-type': 'application/xhtml+xml',
 148                              })
 149     manifest.append(e)
 150
 151
 152 def add_to_spine(spine, partno):
 153     """ Adds a node to the spine section in content.opf file """
 154
 155     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
 156     spine.append(e)
 157
 158
 159 class TOC(object):
 160     def __init__(self, name=None, part_href=None):
 161         self.children = []
 162         self.name = name
 163         self.part_href = part_href
 164         self.sub_number = None
 165
 166     def add(self, name, part_href, level=0, is_part=True, index=None):
 167         assert level == 0 or index is None
 168         if level > 0 and self.children:
 169             return self.children[-1].add(name, part_href, level-1, is_part)
 170         else:
 171             t = TOC(name)
 172             t.part_href = part_href
 173             if index is not None:
 174                 self.children.insert(index, t)
 175             else:
 176                 self.children.append(t)
 177             if not is_part:
 178                 t.sub_number = len(self.children) + 1
 179                 return t.sub_number
 180
 181     def append(self, toc):
 182         self.children.append(toc)
 183
 184     def extend(self, toc):
 185         self.children.extend(toc.children)
 186
 187     def depth(self):
 188         if self.children:
 189             return max((c.depth() for c in self.children)) + 1
 190         else:
 191             return 0
 192
 193     def href(self):
 194         src = self.part_href
 195         if self.sub_number is not None:
 196             src += '#sub%d' % self.sub_number
 197         return src
 198
 199     def write_to_xml(self, nav_map, counter=1):
 200         for child in self.children:
 201             nav_point = nav_map.makeelement(NCXNS('navPoint'))
 202             nav_point.set('id', 'NavPoint-%d' % counter)
 203             nav_point.set('playOrder', str(counter))
 204
 205             nav_label = nav_map.makeelement(NCXNS('navLabel'))
 206             text = nav_map.makeelement(NCXNS('text'))
 207             text.text = child.name
 208             nav_label.append(text)
 209             nav_point.append(nav_label)
 210
 211             content = nav_map.makeelement(NCXNS('content'))
 212             content.set('src', child.href())
 213             nav_point.append(content)
 214             nav_map.append(nav_point)
 215             counter = child.write_to_xml(nav_point, counter + 1)
 216         return counter
 217
 218     def html_part(self, depth=0):
 219         texts = []
 220         for child in self.children:
 221             texts.append(
 222                 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
 223                 (depth, child.href(), child.name))
 224             texts.append(child.html_part(depth+1))
 225         return "\n".join(texts)
 226
 227     def html(self):
 228         with open(get_resource('epub/toc.html')) as f:
 229             t = unicode(f.read(), 'utf-8')
 230         return t % self.html_part()
 231
 232
 233 def used_chars(element):
 234     """ Lists characters used in an ETree Element """
 235     chars = set((element.text or '') + (element.tail or ''))
 236     for child in element:
 237         chars = chars.union(used_chars(child))
 238     return chars
 239
 240
 241 def chop(main_text):
 242     """ divide main content of the XML file into chunks """
 243
 244     # prepare a container for each chunk
 245     part_xml = etree.Element('utwor')
 246     etree.SubElement(part_xml, 'master')
 247     main_xml_part = part_xml[0] # master
 248
 249     last_node_part = False
 250     for one_part in main_text:
 251         name = one_part.tag
 252         if name == 'naglowek_czesc':
 253             yield part_xml
 254             last_node_part = True
 255             main_xml_part[:] = [deepcopy(one_part)]
 256         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 257             yield part_xml
 258             main_xml_part[:] = [deepcopy(one_part)]
 259         else:
 260             main_xml_part.append(deepcopy(one_part))
 261             last_node_part = False
 262     yield part_xml
 263
 264
 265 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
 266     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 267
 268     toc = TOC()
 269     for element in chunk_xml[0]:
 270         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 271             toc.add(node_name(element), "part%d.html" % chunk_no)
 272         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 273             subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
 274             element.set('sub', str(subnumber))
 275     if empty:
 276         if not _empty_html_static:
 277             _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
 278         chars = set()
 279         output_html = _empty_html_static[0]
 280     else:
 281         find_annotations(annotations, chunk_xml, chunk_no)
 282         replace_by_verse(chunk_xml)
 283         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 284         chars = used_chars(html_tree.getroot())
 285         output_html = etree.tostring(html_tree, method="html", pretty_print=True)
 286     return output_html, toc, chars
 287
 288
 289 def transform(wldoc, verbose=False,
 290               style=None, html_toc=False,
 291               sample=None, cover=None, flags=None):
 292     """ produces a EPUB file
 293
 294     sample=n: generate sample e-book (with at least n paragraphs)
 295     cover: a cover.Cover object or True for default
 296     flags: less-advertising, without-fonts, images, not-wl
 297     """
 298
 299     def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
 300         """ processes one input file and proceeds to its children """
 301
 302         replace_characters(wldoc.edoc.getroot())
 303
 304         # every input file will have a TOC entry,
 305         # pointing to starting chunk
 306         toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
 307         chars = set()
 308         if first:
 309             # write book title page
 310             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
 311             chars = used_chars(html_tree.getroot())
 312             zip.writestr('OPS/title.html',
 313                  etree.tostring(html_tree, method="html", pretty_print=True))
 314             # add a title page TOC entry
 315             toc.add(u"Strona tytułowa", "title.html")
 316         elif wldoc.book_info.parts:
 317             # write title page for every parent
 318             if sample is not None and sample <= 0:
 319                 chars = set()
 320                 html_string = open(get_resource('epub/emptyChunk.html')).read()
 321             else:
 322                 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
 323                 chars = used_chars(html_tree.getroot())
 324                 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
 325             zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
 326             add_to_manifest(manifest, chunk_counter)
 327             add_to_spine(spine, chunk_counter)
 328             chunk_counter += 1
 329
 330         if len(wldoc.edoc.getroot()) > 1:
 331             # rdf before style master
 332             main_text = wldoc.edoc.getroot()[1]
 333         else:
 334             # rdf in style master
 335             main_text = wldoc.edoc.getroot()[0]
 336             if main_text.tag == RDFNS('RDF'):
 337                 main_text = None
 338
 339         if main_text is not None:
 340             for chunk_xml in chop(main_text):
 341                 empty = False
 342                 if sample is not None:
 343                     if sample <= 0:
 344                         empty = True
 345                     else:
 346                         sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
 347                 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
 348
 349                 toc.extend(chunk_toc)
 350                 chars = chars.union(chunk_chars)
 351                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
 352                 add_to_manifest(manifest, chunk_counter)
 353                 add_to_spine(spine, chunk_counter)
 354                 chunk_counter += 1
 355
 356         for child in wldoc.parts():
 357             child_toc, chunk_counter, chunk_chars, sample = transform_file(
 358                 child, chunk_counter, first=False, sample=sample)
 359             toc.append(child_toc)
 360             chars = chars.union(chunk_chars)
 361
 362         return toc, chunk_counter, chars, sample
 363
 364
 365     document = deepcopy(wldoc)
 366     del wldoc
 367
 368     if flags:
 369         for flag in flags:
 370             document.edoc.getroot().set(flag, 'yes')
 371
 372     opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
 373     manifest = opf.find('.//' + OPFNS('manifest'))
 374     guide = opf.find('.//' + OPFNS('guide'))
 375     spine = opf.find('.//' + OPFNS('spine'))
 376
 377     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
 378
 379     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 380
 381     # write static elements
 382     mime = zipfile.ZipInfo()
 383     mime.filename = 'mimetype'
 384     mime.compress_type = zipfile.ZIP_STORED
 385     mime.extra = ''
 386     zip.writestr(mime, 'application/epub+zip')
 387     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
 388                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
 389                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
 390                        'media-type="application/oebps-package+xml" />' \
 391                        '</rootfiles></container>')
 392     if not flags or 'not-wl' not in flags:
 393         manifest.append(etree.fromstring(
 394             '<item id="logo_wolnelektury" href="logo_wolnelektury.png" media-type="image/png" />'))
 395         manifest.append(etree.fromstring(
 396             '<item id="jedenprocent" href="jedenprocent.png" media-type="image/png" />'))
 397         zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 398         zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
 399
 400     if not style:
 401         style = get_resource('epub/style.css')
 402     zip.write(style, os.path.join('OPS', 'style.css'))
 403
 404     if cover:
 405         if cover is True:
 406             cover = WLCover
 407         if cover.uses_dc_cover:
 408             if document.book_info.cover_by:
 409                 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
 410             if document.book_info.cover_source:
 411                 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
 412
 413         cover_file = StringIO()
 414         c = cover(document.book_info)
 415         c.save(cover_file)
 416         c_name = 'cover.%s' % c.ext()
 417         zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
 418         del cover_file
 419
 420         cover_tree = etree.parse(get_resource('epub/cover.html'))
 421         cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
 422         zip.writestr('OPS/cover.html', etree.tostring(
 423                         cover_tree, method="html", pretty_print=True))
 424
 425         manifest.append(etree.fromstring(
 426             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
 427         manifest.append(etree.fromstring(
 428             '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
 429         spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
 430         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
 431         guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
 432
 433     if flags and 'images' in flags:
 434         for ilustr in document.edoc.findall('//ilustr'):
 435             src = ilustr.get('src')
 436             mime = ImageCover(src)().mime_type()
 437             zip.write(src, os.path.join('OPS', src))
 438             manifest.append(etree.fromstring(
 439                 '<item id="%s" href="%s" media-type="%s" />' % (src, src, mime)))
 440             # get it up to master
 441             after = ilustr
 442             while after.getparent().tag not in ['powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp', 'dramat_wierszowany_l', 'dramat_wierszowany_lp', 'dramat_wspolczesny']:
 443                 after = after.getparent()
 444             if not(after is ilustr):
 445                 moved = deepcopy(ilustr)
 446                 ilustr.tag = 'extra'
 447                 ilustr.text = None
 448                 moved.tail = None
 449                 after.addnext(moved)
 450     else:
 451         for ilustr in document.edoc.findall('//ilustr'):
 452             ilustr.tag = 'extra'
 453
 454     annotations = etree.Element('annotations')
 455
 456     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
 457                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
 458                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
 459                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
 460                                '</navMap></ncx>')
 461     nav_map = toc_file[-1]
 462
 463     if html_toc:
 464         manifest.append(etree.fromstring(
 465             '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
 466         spine.append(etree.fromstring(
 467             '<itemref idref="html_toc" />'))
 468         guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
 469
 470     toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
 471
 472     if len(toc.children) < 2:
 473         toc.add(u"Początek utworu", "part1.html")
 474
 475     # Last modifications in container files and EPUB creation
 476     if len(annotations) > 0:
 477         toc.add("Przypisy", "annotations.html")
 478         manifest.append(etree.fromstring(
 479             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
 480         spine.append(etree.fromstring(
 481             '<itemref idref="annotations" />'))
 482         replace_by_verse(annotations)
 483         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 484         chars = chars.union(used_chars(html_tree.getroot()))
 485         zip.writestr('OPS/annotations.html', etree.tostring(
 486                             html_tree, method="html", pretty_print=True))
 487
 488     toc.add("Strona redakcyjna", "last.html")
 489     manifest.append(etree.fromstring(
 490         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
 491     spine.append(etree.fromstring(
 492         '<itemref idref="last" />'))
 493     stopka = document.edoc.find('//stopka')
 494     if stopka is not None:
 495         stopka.tag = 'stopka_'
 496         replace_by_verse(stopka)
 497         html_tree = xslt(stopka, get_resource('epub/xsltScheme.xsl'))
 498     else:
 499         html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
 500     chars.update(used_chars(html_tree.getroot()))
 501     zip.writestr('OPS/last.html', etree.tostring(
 502                         html_tree, method="html", pretty_print=True))
 503
 504     if not flags or not 'without-fonts' in flags:
 505         # strip fonts
 506         tmpdir = mkdtemp('-librarian-epub')
 507         cwd = os.getcwd()
 508
 509         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
 510         for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
 511             optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
 512                               get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
 513             if verbose:
 514                 print "Running font-optimizer"
 515                 subprocess.check_call(optimizer_call)
 516             else:
 517                 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 518             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
 519             manifest.append(etree.fromstring(
 520                 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
 521         rmtree(tmpdir)
 522         os.chdir(cwd)
 523
 524     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
 525     title = document.book_info.title
 526     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
 527     for st in attributes:
 528         meta = toc_file.makeelement(NCXNS('meta'))
 529         meta.set('name', st)
 530         meta.set('content', '0')
 531         toc_file[0].append(meta)
 532     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
 533     toc_file[0][1].set('content', str(toc.depth()))
 534     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 535
 536     # write TOC
 537     if html_toc:
 538         toc.add(u"Spis treści", "toc.html", index=1)
 539         zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
 540     toc.write_to_xml(nav_map)
 541     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
 542     zip.close()
 543
 544     return OutputFile.from_filename(output_file.name)