1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import print_function, unicode_literals
13 from copy import deepcopy
14 from mimetypes import guess_type
16 from ebooklib import epub
17 from lxml import etree
19 from tempfile import mkdtemp, NamedTemporaryFile
20 from shutil import rmtree
22 from librarian import RDFNS, WLNS, DCNS, OutputFile
23 from librarian.cover import make_cover
25 from librarian import functions, get_resource
27 from librarian.hyphenator import Hyphenator
# Module-import side effect: registers the person-name helper with
# librarian's `functions` registry (presumably an XSLT extension
# function used by the stylesheets below — confirm in functions.py).
29 functions.reg_person_name()
def squeeze_whitespace(s):
    """Collapse every run of whitespace in *s* into a single space.

    Accepts either ``bytes`` or ``str`` and returns the same type.
    The original implementation used a bytes pattern only, so it
    raised ``TypeError`` when handed text; the ``str`` branch is a
    backward-compatible generalization (all existing callers pass
    bytes and are unaffected).
    """
    if isinstance(s, str):
        return re.sub(r'\s+', ' ', s)
    # bytes input: pattern and replacement must also be bytes
    return re.sub(b'\\s+', b' ', s)
# Build a Hyphenator for the document's language: read dc:language
# from the metadata, map the 3-letter code to a 2-letter one, and
# load the matching bundled hyphenation dictionary.
36 def set_hyph_language(source_tree):
37 bibl_lng = etree.XPath('//dc:language//text()',
38 namespaces={'dc': str(DCNS)})(source_tree)
39 short_lng = functions.lang_code_3to2(bibl_lng[0])
# NOTE(review): assumes at least one dc:language element is present;
# bibl_lng[0] raises IndexError otherwise — confirm callers guarantee it.
41 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
# Walk all text nodes of the main body (/utwor second child) and
# (a) insert soft hyphens (U+00AD) into words so readers can break
# them, (b) glue single-letter words to the following word with a
# no-break space (U+00A0) so they never end a line.
47 def hyphenate_and_fix_conjunctions(source_tree, hyph):
48 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
50 parent = t.getparent()
# split into alternating word / non-word tokens
53 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
# hyph.inserted marks every legal hyphenation point with a soft hyphen
55 newt += hyph.inserted(w, u'\u00AD')
# after a whitespace-preceded single character, swap the trailing
# whitespace for a no-break space
58 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
# (fragment of inner_xml — the `def` line is not visible here)
# Serializes a node's text plus the XML of all its children into one
# string, without the node's own tag.
66 """ returns node's text and children as a string
68 >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
72 nt = node.text if node.text is not None else ''
74 [nt] + [etree.tostring(child, encoding='unicode') for child in node]
# Replace a node's text and children with content parsed from a
# markup string (inverse of inner_xml).
78 def set_inner_xml(node, text):
79 """ sets node's text and children from a string
81 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
82 >>> set_inner_xml(e, 'x<b>y</b>z')
83 >>> print(etree.tostring(e, encoding='unicode'))
# parse the replacement content inside a throwaway <x> wrapper, then
# (in elided lines) copy its text/children onto *node*
87 p = etree.fromstring('<x>%s</x>' % text)
# (fragment of node_name — the `def` line is not visible here)
# Returns the plain-text "name" of a node: its text content with
# footnote and theme marker elements removed and all markup flattened.
93 """ Find out a node's name
95 >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
# work on a copy so the original tree is untouched
99 tempnode = deepcopy(node)
# strip footnotes (pe/pa/pt/pr) and theme markers (motyw)
101 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
102 for e in tempnode.findall('.//%s' % p):
# flatten any remaining child tags, keeping only their text
106 etree.strip_tags(tempnode, '*')
# Apply the XSLT stylesheet at path *sheet* to *xml* (an Element or
# ElementTree); every keyword argument is forwarded to the transform
# as a quoted XSLT string parameter. Returns the result tree.
110 def xslt(xml, sheet, **kwargs):
111 if isinstance(xml, etree._Element):
112 xml = etree.ElementTree(xml)
113 with open(sheet) as xsltf:
114 transform = etree.XSLT(etree.parse(xsltf))
# strparam() escapes each value so it is safe as an XSLT parameter
116 (key, transform.strparam(value))
117 for key, value in kwargs.items()
119 return transform(xml, **params)
# Recursively apply typographic substitutions (em/en dashes, Polish
# quotation marks, apostrophe, BOM removal) to all text and tails in
# the subtree, skipping editorial <uwaga>/<extra> nodes.
122 def replace_characters(node):
123 def replace_chars(text):
126 return text.replace(u"\ufeff", u"")\
127 .replace("---", u"\u2014")\
128 .replace("--", u"\u2013")\
129 .replace(",,", u"\u201E")\
130 .replace('"', u"\u201D")\
131 .replace("'", u"\u2019")
# editorial notes keep their literal characters
132 if node.tag in ('uwaga', 'extra'):
136 node.text = replace_chars(node.text)
137 node.tail = replace_chars(node.tail)
139 replace_characters(child)
# Harvest footnote elements (pe/pa/pt/pr) from *source* into the
# shared *annotations* element: each copy gets a sequential 'number'
# attribute and a 'part' attribute recording the chunk it came from.
# Mutates *annotations* in place; recurses into children.
142 def find_annotations(annotations, source, part_no):
144 if child.tag in ('pe', 'pa', 'pt', 'pr'):
145 annotation = deepcopy(child)
# numbering is global across the whole book, 1-based
146 number = str(len(annotations) + 1)
147 annotation.set('number', number)
148 annotation.set('part', str(part_no))
150 annotations.append(annotation)
# do not collect footnotes inside editorial notes
155 if child.tag not in ('extra', 'uwaga'):
156 find_annotations(annotations, child, part_no)
# Splits a <strofa> (stanza) into explicit <wers_normalny> verse
# elements wherever a '/' + newline separator occurs directly in the
# stanza's own text. Slashes inside subelements are left alone.
159 class Stanza(object):
161 Converts / verse endings into verse elements in a stanza.
163 Slashes may only occur directly in the stanza. Any slashes in subelements
164 will be ignored, and the subelements will be put inside verse elements.
166 >>> s = etree.fromstring(
167 ... "<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>"
169 >>> Stanza(s).versify()
170 >>> print(etree.tostring(s, encoding='unicode', pretty_print=True).strip())
172 <wers_normalny>a <b>c</b><b>c</b></wers_normalny>
173 <wers_normalny>b<x>x/
174 y</x>c</wers_normalny>
175 <wers_normalny>d</wers_normalny>
179 def __init__(self, stanza_elem):
180 self.stanza = stanza_elem
# currently-open verse element that new content is appended to
182 self.open_verse = None
# (versify, partially elided) feed stanza text and children through
# push_text/push_elem, then keep only non-empty verses
185 self.push_text(self.stanza.text)
186 for elem in self.stanza:
188 self.push_text(elem.tail)
189 tail = self.stanza.tail
191 self.stanza.tail = tail
193 verse for verse in self.verses
194 if verse.text or len(verse) > 0
# start a fresh <wers_normalny> and make it the open verse
197 def open_normal_verse(self):
198 self.open_verse = self.stanza.makeelement("wers_normalny")
199 self.verses.append(self.open_verse)
# lazily create a verse if none is open yet
201 def get_open_verse(self):
202 if self.open_verse is None:
203 self.open_normal_verse()
204 return self.open_verse
# split text on '/'+newline; each split after the first opens a new
# verse, and text lands either in the verse's text or in the tail of
# its last child
206 def push_text(self, text):
209 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
211 self.open_normal_verse()
212 if not verse_text.strip():
214 verse = self.get_open_verse()
216 verse[-1].tail = (verse[-1].tail or "") + verse_text
218 verse.text = (verse.text or "") + verse_text
# elements already tagged as verses ("wers*") become verses of their
# own; anything else is appended inside the current open verse
220 def push_elem(self, elem):
221 if elem.tag.startswith("wers"):
222 verse = deepcopy(elem)
224 self.verses.append(verse)
225 self.open_verse = verse
227 appended = deepcopy(elem)
229 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """Split every stanza in *tree* into explicit verse elements.

    Walks all WL <strofa> elements and lets Stanza turn the '/'
    separators inside each one into verse sub-elements, in place.
    """
    for stanza in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza).versify()
# Recursively collect the set of characters appearing in an element's
# text/tail and in all descendants (used later for font subsetting).
240 def used_chars(element):
241 """ Lists characters used in an ETree Element """
242 chars = set((element.text or '') + (element.tail or ''))
243 for child in element:
244 chars = chars.union(used_chars(child))
# (generator `chop` — its `def` line is not visible here)
# Splits the master text into chapter-sized <utwor> chunks; each
# yielded chunk is a fresh container with deep copies of the nodes,
# cut at header elements.
249 """ divide main content of the XML file into chunks """
251 # prepare a container for each chunk
252 part_xml = etree.Element('utwor')
253 etree.SubElement(part_xml, 'master')
254 main_xml_part = part_xml[0] # master
256 last_node_part = False
258 # The loops below are a workaround for a problem with epubs
259 # in drama ebooks without acts.
# first pass: detect whether the text has scene headers but no acts
262 for one_part in main_text:
264 if name == 'naglowek_scena':
266 elif name == 'naglowek_akt':
269 for one_part in main_text:
# act-less drama: cut chunks on part and scene headers
271 if is_act is False and is_scene is True:
272 if name == 'naglowek_czesc':
274 last_node_part = True
275 main_xml_part[:] = [deepcopy(one_part)]
276 elif not last_node_part and name == "naglowek_scena":
278 main_xml_part[:] = [deepcopy(one_part)]
280 main_xml_part.append(deepcopy(one_part))
281 last_node_part = False
# default: cut chunks on part / chapter / act / inner-title headers
283 if name == 'naglowek_czesc':
285 last_node_part = True
286 main_xml_part[:] = [deepcopy(one_part)]
287 elif (not last_node_part
289 "naglowek_rozdzial", "naglowek_akt", "srodtytul"
292 main_xml_part[:] = [deepcopy(one_part)]
294 main_xml_part.append(deepcopy(one_part))
295 last_node_part = False
# Render one chunk to XHTML. Returns (output_html, toc, chars):
# the serialized page, the TOC entries found in the chunk, and the
# set of characters used (for font subsetting).
# NOTE: `_empty_html_static=[]` is a deliberate mutable-default
# cache — the empty-chunk template is read from disk only once per
# process and reused for every sampled-out chunk.
299 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False,
300 _empty_html_static=[]):
302 Transforms one chunk, returns a HTML string, a TOC object
303 and a set of used characters.
# scan top-level elements of the chunk's master for headers that
# should become TOC entries
307 for element in chunk_xml[0]:
308 if element.tag == "naglowek_czesc":
312 "part%d.xhtml#book-text" % chunk_no,
314 "part%d-text" % chunk_no
319 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
323 "part%d.xhtml" % chunk_no,
# sub-headers nest under the last top-level TOC entry and get an
# in-page anchor (#subN)
330 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
335 "part%d.xhtml" % chunk_no,
343 subnumber = len(toc[-1][1])
346 "part%d.xhtml#sub%d" % (chunk_no, subnumber),
348 "part%d-sub%d" % (chunk_no, subnumber)
# mark the element so the stylesheet can emit the matching anchor
351 element.set('sub', six.text_type(subnumber))
# empty=True: emit the cached placeholder page instead of rendering
353 if not _empty_html_static:
354 with open(get_resource('epub/emptyChunk.xhtml')) as f:
355 _empty_html_static.append(f.read())
357 output_html = _empty_html_static[0]
359 find_annotations(annotations, chunk_xml, chunk_no)
360 replace_by_verse(chunk_xml)
361 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
362 chars = used_chars(html_tree.getroot())
363 output_html = etree.tostring(
364 html_tree, pretty_print=True, xml_declaration=True,
366 doctype='<!DOCTYPE html>'
368 return output_html, toc, chars
# Recursively prune the nested TOC structure: entries are either
# plain links or (link, children) tuples; descend into each tuple's
# child list (elided lines presumably flatten tuples whose child
# list is empty — confirm against the missing branch).
371 def remove_empty_lists_from_toc(toc):
372 for i, e in enumerate(toc):
373 if isinstance(e, tuple):
375 remove_empty_lists_from_toc(e[1])
# Top-level entry point: convert a WL document (plus its parts) into
# an EPUB and return it as an OutputFile.
380 def transform(wldoc, verbose=False, style=None,
381 sample=None, cover=None, flags=None, hyphenate=False,
382 base_url='file://./', output_type='epub'):
383 """ produces an EPUB file
385 sample=n: generate sample e-book (with at least n paragraphs)
386 cover: a cover.Cover factory or True for default
387 flags: less-advertising, without-fonts, working-copy
# Nested worker: renders one document (title page + chunks), recurses
# into child documents, and returns (toc, chunk_counter, chars, sample).
390 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
391 """ processes one input file and proceeds to its children """
393 replace_characters(wldoc.edoc.getroot())
395 hyphenator = set_hyph_language(
397 ) if hyphenate else None
398 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
400 # every input file will have a TOC entry,
401 # pointing to starting chunk
405 "part%d.xhtml" % chunk_counter,
406 wldoc.book_info.title,
# NOTE(review): "path%d-start" looks like a typo for "part%d-start",
# but it is a runtime identifier — do not change without checking
# existing EPUB anchors.
407 "path%d-start" % chunk_counter
414 # write book title page
415 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'),
416 outputtype=output_type)
417 chars = used_chars(html_tree.getroot())
418 html_string = etree.tostring(
419 html_tree, pretty_print=True, xml_declaration=True,
421 doctype='<!DOCTYPE html>'
423 item = epub.EpubItem(
425 file_name="title.xhtml",
426 media_type="application/xhtml+xml",
427 content=squeeze_whitespace(html_string)
430 output.add_item(item)
431 # add a title page TOC entry
440 item = epub.EpubNav()
448 output.add_item(item)
# child documents that themselves have parts get a chunk-title page
# (or the empty placeholder once the sample budget is exhausted)
451 elif wldoc.book_info.parts:
452 # write title page for every parent
453 if sample is not None and sample <= 0:
456 get_resource('epub/emptyChunk.xhtml')).read()
458 html_tree = xslt(wldoc.edoc,
459 get_resource('epub/xsltChunkTitle.xsl'))
460 chars = used_chars(html_tree.getroot())
461 html_string = etree.tostring(
462 html_tree, pretty_print=True, xml_declaration=True,
464 doctype='<!DOCTYPE html>'
466 item = epub.EpubItem(
467 uid="part%d" % chunk_counter,
468 file_name="part%d.xhtml" % chunk_counter,
469 media_type="application/xhtml+xml",
470 content=squeeze_whitespace(html_string)
472 output.add_item(item)
# locate the master text: its position depends on whether the RDF
# metadata block precedes it or lives inside it
477 if len(wldoc.edoc.getroot()) > 1:
478 # rdf before style master
479 main_text = wldoc.edoc.getroot()[1]
481 # rdf in style master
482 main_text = wldoc.edoc.getroot()[0]
483 if main_text.tag == RDFNS('RDF'):
486 if main_text is not None:
487 for chunk_xml in chop(main_text):
# sample mode: count paragraphs/stanzas against the remaining budget
489 if sample is not None:
493 sample -= len(chunk_xml.xpath(
494 '//strofa|//akap|//akap_cd|//akap_dialog'
496 chunk_html, chunk_toc, chunk_chars = transform_chunk(
497 chunk_xml, chunk_counter, annotations, empty)
499 toc[-1][1].extend(chunk_toc)
500 chars = chars.union(chunk_chars)
501 item = epub.EpubItem(
502 uid="part%d" % chunk_counter,
503 file_name="part%d.xhtml" % chunk_counter,
504 media_type="application/xhtml+xml",
505 content=squeeze_whitespace(chunk_html)
507 output.add_item(item)
# recurse into child documents, threading the chunk counter and
# remaining sample budget through
511 for child in wldoc.parts():
512 child_toc, chunk_counter, chunk_chars, sample = transform_file(
513 child, chunk_counter, first=False, sample=sample)
514 toc[-1][1].extend(child_toc)
515 chars = chars.union(chunk_chars)
517 return toc, chunk_counter, chars, sample
# --- main body: work on a copy so the caller's document is untouched
519 document = deepcopy(wldoc)
524 document.edoc.getroot().set(flag, 'yes')
526 document.clean_ed_note()
527 document.clean_ed_note('abstrakt')
# expose editors / funders / thanks as root attributes for the XSLT
530 editors = document.editors()
532 document.edoc.getroot().set('editors', u', '.join(sorted(
533 editor.readable() for editor in editors)))
534 if document.book_info.funders:
535 document.edoc.getroot().set('funders', u', '.join(
536 document.book_info.funders))
537 if document.book_info.thanks:
538 document.edoc.getroot().set('thanks', document.book_info.thanks)
# assemble EPUB metadata from book_info
540 output = epub.EpubBook()
541 output.set_identifier(six.text_type(document.book_info.url))
542 output.set_language(functions.lang_code_3to2(document.book_info.language))
543 output.set_title(document.book_info.title)
544 for author in document.book_info.authors:
547 file_as=six.text_type(author)
549 for translator in document.book_info.translators:
551 translator.readable(),
552 file_as=six.text_type(translator),
555 for publisher in document.book_info.publisher:
556 output.add_metadata("DC", "publisher", publisher)
557 output.add_metadata("DC", "date", document.book_info.created_at)
559 output.guide.append({
562 "href": "part1.xhtml"
565 output.add_item(epub.EpubNcx())
569 functions.reg_mathml_epub(output)
# download every illustration, scale it down, and embed it
572 for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
573 url = six.moves.urllib.parse.urljoin(
577 with six.moves.urllib.request.urlopen(url) as imgfile:
578 img = Image.open(imgfile)
# choose output format/extension by source format; default JPEG
580 th_format, ext, media_type = {
581 'GIF': ('GIF', 'gif', 'image/gif'),
582 'PNG': ('PNG', 'png', 'image/png'),
583 }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
586 if img.size[0] < width:
# resize keeping the aspect ratio
589 th = img.resize((width, round(width * img.size[1] / img.size[0])))
591 buffer = six.BytesIO()
592 th.save(buffer, format=th_format)
594 file_name = 'image%d.%s' % (i, ext)
595 ilustr.set('src', file_name)
600 media_type=media_type,
601 content=buffer.getvalue()
605 # write static elements
607 with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
610 uid="logo_wolnelektury.png",
611 file_name="logo_wolnelektury.png",
612 media_type="image/png",
616 with open(get_resource('res/jedenprocent.png'), 'rb') as f:
620 file_name="jedenprocent.png",
621 media_type="image/png",
# stylesheet: caller-supplied path or the bundled default
627 style = get_resource('epub/style.css')
628 with open(style, 'rb') as f:
632 file_name="style.css",
633 media_type="text/css",
# render the cover (if requested) and register it in spine and guide
642 cover_file = six.BytesIO()
643 bound_cover = cover(document.book_info)
644 bound_cover.save(cover_file)
645 cover_name = 'cover.%s' % bound_cover.ext()
648 file_name=cover_name,
649 content=cover_file.getvalue(),
651 spine.append('cover')
652 output.guide.append({
654 "href": "cover.xhtml",
# if the cover design uses DC metadata, expose attribution on the root
660 if bound_cover.uses_dc_cover:
661 if document.book_info.cover_by:
662 document.edoc.getroot().set('data-cover-by',
663 document.book_info.cover_by)
664 if document.book_info.cover_source:
665 document.edoc.getroot().set('data-cover-source',
666 document.book_info.cover_source)
# shared accumulator filled by find_annotations during chunking
668 annotations = etree.Element('annotations')
670 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
671 output.toc = toc[0][1]
682 # Last modifications in container files and EPUB creation
# footnotes page, only when any annotations were collected
683 if len(annotations) > 0:
691 replace_by_verse(annotations)
692 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
693 chars = chars.union(used_chars(html_tree.getroot()))
695 item = epub.EpubItem(
697 file_name="annotations.xhtml",
698 media_type="application/xhtml+xml",
699 content=etree.tostring(
700 html_tree, pretty_print=True, xml_declaration=True,
702 doctype='<!DOCTYPE html>'
705 output.add_item(item)
# "support us" page
711 "Wesprzyj Wolne Lektury",
715 with open(get_resource('epub/support.xhtml'), 'rb') as f:
716 html_string = f.read()
717 chars.update(used_chars(etree.fromstring(html_string)))
718 item = epub.EpubItem(
720 file_name="support.xhtml",
721 media_type="application/xhtml+xml",
722 content=squeeze_whitespace(html_string)
724 output.add_item(item)
# closing page (colophon / license info)
734 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'),
735 outputtype=output_type)
736 chars.update(used_chars(html_tree.getroot()))
737 item = epub.EpubItem(
739 file_name="last.xhtml",
740 media_type="application/xhtml+xml",
741 content=squeeze_whitespace(etree.tostring(
742 html_tree, pretty_print=True, xml_declaration=True,
744 doctype='<!DOCTYPE html>'
747 output.add_item(item)
# font subsetting: run the bundled perl font-optimizer over each
# DejaVu face, keeping only the characters actually used
750 if not flags or 'without-fonts' not in flags:
752 tmpdir = mkdtemp('-librarian-epub')
758 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)),
760 for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
761 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
762 optimizer_call = ['perl', 'subset.pl', '--chars',
763 ''.join(chars).encode('utf-8'),
764 get_resource('fonts/' + fname),
765 os.path.join(tmpdir, fname)]
766 env = {"PERL_USE_UNSAFE_INC": "1"}
768 print("Running font-optimizer")
769 subprocess.check_call(optimizer_call, env=env)
# NOTE(review): dev_null is opened but never explicitly closed here;
# consider subprocess.DEVNULL instead — confirm against elided lines.
771 dev_null = open(os.devnull, 'w')
772 subprocess.check_call(optimizer_call, stdout=dev_null,
773 stderr=dev_null, env=env)
774 with open(os.path.join(tmpdir, fname), 'rb') as f:
779 media_type="font/ttf",
787 remove_empty_lists_from_toc(output.toc)
# write to a named temp file and hand it back as an OutputFile
789 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub',
792 epub.write_epub(output_file.name, output, {'epub3_landmark': False})
793 return OutputFile.from_filename(output_file.name)