src/librarian/epub.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import print_function, unicode_literals
   7
   8 import os
   9 import os.path
  10 import re
  11 import subprocess
  12 import six
  13 from copy import deepcopy
  14 from mimetypes import guess_type
  15
  16 from ebooklib import epub
  17 from lxml import etree
  18 from PIL import Image
  19 from tempfile import mkdtemp, NamedTemporaryFile
  20 from shutil import rmtree
  21
  22 from librarian import RDFNS, WLNS, DCNS, OutputFile
  23 from librarian.cover import make_cover
  24
  25 from librarian import functions, get_resource
  26
  27 from librarian.hyphenator import Hyphenator
  28
  29 functions.reg_person_name()
  30
  31
  32 def squeeze_whitespace(s):
  33     return re.sub(b'\\s+', b' ', s)
  34
  35
  36 def set_hyph_language(source_tree):
  37     bibl_lng = etree.XPath('//dc:language//text()',
  38                            namespaces={'dc': str(DCNS)})(source_tree)
  39     short_lng = functions.lang_code_3to2(bibl_lng[0])
  40     try:
  41         return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
  42                                        short_lng + '.dic'))
  43     except:
  44         pass
  45
  46
  47 def hyphenate_and_fix_conjunctions(source_tree, hyph):
  48     texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
  49     for t in texts:
  50         parent = t.getparent()
  51         if hyph is not None:
  52             newt = ''
  53             wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
  54             for w in wlist:
  55                 newt += hyph.inserted(w, u'\u00AD')
  56         else:
  57             newt = t
  58         newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
  59         if t.is_text:
  60             parent.text = newt
  61         elif t.is_tail:
  62             parent.tail = newt
  63
  64
  65 def inner_xml(node):
  66     """ returns node's text and children as a string
  67
  68     >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
  69     x<b>y</b>z
  70     """
  71
  72     nt = node.text if node.text is not None else ''
  73     return ''.join(
  74         [nt] + [etree.tostring(child, encoding='unicode') for child in node]
  75     )
  76
  77
  78 def set_inner_xml(node, text):
  79     """ sets node's text and children from a string
  80
  81     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
  82     >>> set_inner_xml(e, 'x<b>y</b>z')
  83     >>> print(etree.tostring(e, encoding='unicode'))
  84     <a>x<b>y</b>z</a>
  85     """
  86
  87     p = etree.fromstring('<x>%s</x>' % text)
  88     node.text = p.text
  89     node[:] = p[:]
  90
  91
  92 def node_name(node):
  93     """ Find out a node's name
  94
  95     >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
  96     XYZ
  97     """
  98
  99     tempnode = deepcopy(node)
 100
 101     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
 102         for e in tempnode.findall('.//%s' % p):
 103             t = e.tail
 104             e.clear()
 105             e.tail = t
 106     etree.strip_tags(tempnode, '*')
 107     return tempnode.text
 108
 109
 110 def xslt(xml, sheet, **kwargs):
 111     if isinstance(xml, etree._Element):
 112         xml = etree.ElementTree(xml)
 113     with open(sheet) as xsltf:
 114         transform = etree.XSLT(etree.parse(xsltf))
 115         params = dict(
 116             (key, transform.strparam(value))
 117             for key, value in kwargs.items()
 118         )
 119         return transform(xml, **params)
 120
 121
 122 def replace_characters(node):
 123     def replace_chars(text):
 124         if text is None:
 125             return None
 126         return text.replace(u"\ufeff", u"")\
 127                    .replace("---", u"\u2014")\
 128                    .replace("--", u"\u2013")\
 129                    .replace(",,", u"\u201E")\
 130                    .replace('"', u"\u201D")\
 131                    .replace("'", u"\u2019")
 132     if node.tag in ('uwaga', 'extra'):
 133         t = node.tail
 134         node.clear()
 135         node.tail = t
 136     node.text = replace_chars(node.text)
 137     node.tail = replace_chars(node.tail)
 138     for child in node:
 139         replace_characters(child)
 140
 141
 142 def find_annotations(annotations, source, part_no):
 143     for child in source:
 144         if child.tag in ('pe', 'pa', 'pt', 'pr'):
 145             annotation = deepcopy(child)
 146             number = str(len(annotations) + 1)
 147             annotation.set('number', number)
 148             annotation.set('part', str(part_no))
 149             annotation.tail = ''
 150             annotations.append(annotation)
 151             tail = child.tail
 152             child.clear()
 153             child.tail = tail
 154             child.text = number
 155         if child.tag not in ('extra', 'uwaga'):
 156             find_annotations(annotations, child, part_no)
 157
 158
 159 class Stanza(object):
 160     """
 161     Converts / verse endings into verse elements in a stanza.
 162
 163     Slashes may only occur directly in the stanza. Any slashes in subelements
 164     will be ignored, and the subelements will be put inside verse elements.
 165
 166     >>> s = etree.fromstring(
 167     ...         "<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>"
 168     ...     )
 169     >>> Stanza(s).versify()
 170     >>> print(etree.tostring(s, encoding='unicode', pretty_print=True).strip())
 171     <strofa>
 172       <wers_normalny>a <b>c</b><b>c</b></wers_normalny>
 173       <wers_normalny>b<x>x/
 174     y</x>c</wers_normalny>
 175       <wers_normalny>d</wers_normalny>
 176     </strofa>
 177
 178     """
 179     def __init__(self, stanza_elem):
 180         self.stanza = stanza_elem
 181         self.verses = []
 182         self.open_verse = None
 183
 184     def versify(self):
 185         self.push_text(self.stanza.text)
 186         for elem in self.stanza:
 187             self.push_elem(elem)
 188             self.push_text(elem.tail)
 189         tail = self.stanza.tail
 190         self.stanza.clear()
 191         self.stanza.tail = tail
 192         self.stanza.extend(
 193             verse for verse in self.verses
 194             if verse.text or len(verse) > 0
 195         )
 196
 197     def open_normal_verse(self):
 198         self.open_verse = self.stanza.makeelement("wers_normalny")
 199         self.verses.append(self.open_verse)
 200
 201     def get_open_verse(self):
 202         if self.open_verse is None:
 203             self.open_normal_verse()
 204         return self.open_verse
 205
 206     def push_text(self, text):
 207         if not text:
 208             return
 209         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
 210             if i:
 211                 self.open_normal_verse()
 212             if not verse_text.strip():
 213                 continue
 214             verse = self.get_open_verse()
 215             if len(verse):
 216                 verse[-1].tail = (verse[-1].tail or "") + verse_text
 217             else:
 218                 verse.text = (verse.text or "") + verse_text
 219
 220     def push_elem(self, elem):
 221         if elem.tag.startswith("wers"):
 222             verse = deepcopy(elem)
 223             verse.tail = None
 224             self.verses.append(verse)
 225             self.open_verse = verse
 226         else:
 227             appended = deepcopy(elem)
 228             appended.tail = None
 229             self.get_open_verse().append(appended)
 230
 231
 232 def replace_by_verse(tree):
 233     """ Find stanzas and create new verses in place of a '/' character """
 234
 235     stanzas = tree.findall('.//' + WLNS('strofa'))
 236     for stanza in stanzas:
 237         Stanza(stanza).versify()
 238
 239
 240 def used_chars(element):
 241     """ Lists characters used in an ETree Element """
 242     chars = set((element.text or '') + (element.tail or ''))
 243     for child in element:
 244         chars = chars.union(used_chars(child))
 245     return chars
 246
 247
 248 def chop(main_text):
 249     """ divide main content of the XML file into chunks """
 250
 251     # prepare a container for each chunk
 252     part_xml = etree.Element('utwor')
 253     etree.SubElement(part_xml, 'master')
 254     main_xml_part = part_xml[0]  # master
 255
 256     last_node_part = False
 257
 258     # The below loop are workaround for a problem with epubs
 259     # in drama ebooks without acts.
 260     is_scene = False
 261     is_act = False
 262     for one_part in main_text:
 263         name = one_part.tag
 264         if name == 'naglowek_scena':
 265             is_scene = True
 266         elif name == 'naglowek_akt':
 267             is_act = True
 268
 269     for one_part in main_text:
 270         name = one_part.tag
 271         if is_act is False and is_scene is True:
 272             if name == 'naglowek_czesc':
 273                 yield part_xml
 274                 last_node_part = True
 275                 main_xml_part[:] = [deepcopy(one_part)]
 276             elif not last_node_part and name == "naglowek_scena":
 277                 yield part_xml
 278                 main_xml_part[:] = [deepcopy(one_part)]
 279             else:
 280                 main_xml_part.append(deepcopy(one_part))
 281                 last_node_part = False
 282         else:
 283             if name == 'naglowek_czesc':
 284                 yield part_xml
 285                 last_node_part = True
 286                 main_xml_part[:] = [deepcopy(one_part)]
 287             elif (not last_node_part
 288                   and name in (
 289                       "naglowek_rozdzial", "naglowek_akt", "srodtytul"
 290                   )):
 291                 yield part_xml
 292                 main_xml_part[:] = [deepcopy(one_part)]
 293             else:
 294                 main_xml_part.append(deepcopy(one_part))
 295                 last_node_part = False
 296     yield part_xml
 297
 298
 299 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False,
 300                     _empty_html_static=[]):
 301     """
 302     Transforms one chunk, returns a HTML string, a TOC object
 303     and a set of used characters.
 304     """
 305
 306     toc = []
 307     for element in chunk_xml[0]:
 308         if element.tag == "naglowek_czesc":
 309             toc.append(
 310                 (
 311                     epub.Link(
 312                         "part%d.xhtml#book-text" % chunk_no,
 313                         node_name(element),
 314                         "part%d-text" % chunk_no
 315                     ),
 316                     []
 317                 )
 318             )
 319         elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
 320             toc.append(
 321                 (
 322                     epub.Link(
 323                         "part%d.xhtml" % chunk_no,
 324                         node_name(element),
 325                         "part%d" % chunk_no
 326                     ),
 327                     []
 328                 )
 329             )
 330         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
 331             if not toc:
 332                 toc.append(
 333                     (
 334                         epub.Link(
 335                             "part%d.xhtml" % chunk_no,
 336                             " ",
 337                             "part%d" % chunk_no
 338                         ),
 339                         []
 340                     )
 341                 )
 342
 343             subnumber = len(toc[-1][1])
 344             toc[-1][1].append(
 345                 epub.Link(
 346                     "part%d.xhtml#sub%d" % (chunk_no, subnumber),
 347                     node_name(element),
 348                     "part%d-sub%d" % (chunk_no, subnumber)
 349                 )
 350             )
 351             element.set('sub', six.text_type(subnumber))
 352     if empty:
 353         if not _empty_html_static:
 354             with open(get_resource('epub/emptyChunk.xhtml')) as f:
 355                 _empty_html_static.append(f.read())
 356         chars = set()
 357         output_html = _empty_html_static[0]
 358     else:
 359         find_annotations(annotations, chunk_xml, chunk_no)
 360         replace_by_verse(chunk_xml)
 361         html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
 362         chars = used_chars(html_tree.getroot())
 363         output_html = etree.tostring(
 364             html_tree, pretty_print=True, xml_declaration=True,
 365             encoding="utf-8",
 366             doctype='<!DOCTYPE html>'
 367         )
 368     return output_html, toc, chars
 369
 370
 371 def remove_empty_lists_from_toc(toc):
 372     for i, e in enumerate(toc):
 373         if isinstance(e, tuple):
 374             if e[1]:
 375                 remove_empty_lists_from_toc(e[1])
 376             else:
 377                 toc[i] = e[0]
 378
 379
 380 def transform(wldoc, verbose=False, style=None,
 381               sample=None, cover=None, flags=None, hyphenate=False,
 382               base_url='file://./', output_type='epub'):
 383     """ produces a EPUB file
 384
 385     sample=n: generate sample e-book (with at least n paragraphs)
 386     cover: a cover.Cover factory or True for default
 387     flags: less-advertising, without-fonts, working-copy
 388     """
 389
 390     def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
 391         """ processes one input file and proceeds to its children """
 392
 393         replace_characters(wldoc.edoc.getroot())
 394
 395         hyphenator = set_hyph_language(
 396             wldoc.edoc.getroot()
 397         ) if hyphenate else None
 398         hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
 399
 400         # every input file will have a TOC entry,
 401         # pointing to starting chunk
 402         toc = [
 403             (
 404                 epub.Link(
 405                     "part%d.xhtml" % chunk_counter,
 406                     wldoc.book_info.title,
 407                     "path%d-start" % chunk_counter
 408                 ),
 409                 []
 410             )
 411         ]
 412         chars = set()
 413         if first:
 414             # write book title page
 415             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'),
 416                              outputtype=output_type)
 417             chars = used_chars(html_tree.getroot())
 418             html_string = etree.tostring(
 419                 html_tree, pretty_print=True, xml_declaration=True,
 420                 encoding="utf-8",
 421                 doctype='<!DOCTYPE html>'
 422             )
 423             item = epub.EpubItem(
 424                 uid="titlePage",
 425                 file_name="title.xhtml",
 426                 media_type="application/xhtml+xml",
 427                 content=squeeze_whitespace(html_string)
 428             )
 429             spine.append(item)
 430             output.add_item(item)
 431             # add a title page TOC entry
 432             toc[-1][1].append(
 433                 epub.Link(
 434                     "title.xhtml",
 435                     "Strona tytułowa",
 436                     "title",
 437                 )
 438             )
 439
 440             item = epub.EpubNav()
 441             toc[-1][1].append(
 442                 epub.Link(
 443                     "nav.xhtml",
 444                     "Spis treści",
 445                     "nav"
 446                 )
 447             )
 448             output.add_item(item)
 449             spine.append(item)
 450
 451             toc[-1][1].append(
 452                 epub.Link(
 453                     "part1.xhtml",
 454                     "Początek utworu",
 455                     "part1"
 456                 )
 457             )
 458
 459         elif wldoc.book_info.parts:
 460             # write title page for every parent
 461             if sample is not None and sample <= 0:
 462                 chars = set()
 463                 html_string = open(
 464                     get_resource('epub/emptyChunk.xhtml')).read()
 465             else:
 466                 html_tree = xslt(wldoc.edoc,
 467                                  get_resource('epub/xsltChunkTitle.xsl'))
 468                 chars = used_chars(html_tree.getroot())
 469                 html_string = etree.tostring(
 470                     html_tree, pretty_print=True, xml_declaration=True,
 471                     encoding="utf-8",
 472                     doctype='<!DOCTYPE html>'
 473                 )
 474             item = epub.EpubItem(
 475                 uid="part%d" % chunk_counter,
 476                 file_name="part%d.xhtml" % chunk_counter,
 477                 media_type="application/xhtml+xml",
 478                 content=squeeze_whitespace(html_string)
 479             )
 480             output.add_item(item)
 481             spine.append(item)
 482
 483             chunk_counter += 1
 484
 485         if len(wldoc.edoc.getroot()) > 1:
 486             # rdf before style master
 487             main_text = wldoc.edoc.getroot()[1]
 488         else:
 489             # rdf in style master
 490             main_text = wldoc.edoc.getroot()[0]
 491             if main_text.tag == RDFNS('RDF'):
 492                 main_text = None
 493
 494         if main_text is not None:
 495             for chunk_xml in chop(main_text):
 496                 empty = False
 497                 if sample is not None:
 498                     if sample <= 0:
 499                         empty = True
 500                     else:
 501                         sample -= len(chunk_xml.xpath(
 502                             '//strofa|//akap|//akap_cd|//akap_dialog'
 503                         ))
 504                 chunk_html, chunk_toc, chunk_chars = transform_chunk(
 505                     chunk_xml, chunk_counter, annotations, empty)
 506
 507                 toc[-1][1].extend(chunk_toc)
 508                 chars = chars.union(chunk_chars)
 509                 item = epub.EpubItem(
 510                     uid="part%d" % chunk_counter,
 511                     file_name="part%d.xhtml" % chunk_counter,
 512                     media_type="application/xhtml+xml",
 513                     content=squeeze_whitespace(chunk_html)
 514                 )
 515                 output.add_item(item)
 516                 spine.append(item)
 517                 chunk_counter += 1
 518
 519         for child in wldoc.parts():
 520             child_toc, chunk_counter, chunk_chars, sample = transform_file(
 521                 child, chunk_counter, first=False, sample=sample)
 522             toc[-1][1].extend(child_toc)
 523             chars = chars.union(chunk_chars)
 524
 525         return toc, chunk_counter, chars, sample
 526
 527     document = deepcopy(wldoc)
 528     del wldoc
 529
 530     if flags:
 531         for flag in flags:
 532             document.edoc.getroot().set(flag, 'yes')
 533
 534     document.clean_ed_note()
 535     document.clean_ed_note('abstrakt')
 536
 537     # add editors info
 538     editors = document.editors()
 539     if editors:
 540         document.edoc.getroot().set('editors', u', '.join(sorted(
 541             editor.readable() for editor in editors)))
 542     if document.book_info.funders:
 543         document.edoc.getroot().set('funders', u', '.join(
 544             document.book_info.funders))
 545     if document.book_info.thanks:
 546         document.edoc.getroot().set('thanks', document.book_info.thanks)
 547
 548     output = epub.EpubBook()
 549     output.set_identifier(six.text_type(document.book_info.url))
 550     output.set_language(functions.lang_code_3to2(document.book_info.language))
 551     output.set_title(document.book_info.title)
 552     for i, author in enumerate(document.book_info.authors):
 553         output.add_author(
 554             author.readable(),
 555             file_as=six.text_type(author),
 556             uid='creator{}'.format(i)
 557         )
 558     for translator in document.book_info.translators:
 559         output.add_author(
 560             translator.readable(),
 561             file_as=six.text_type(translator),
 562             role='trl',
 563             uid='translator{}'.format(i)
 564         )
 565     for publisher in document.book_info.publisher:
 566         output.add_metadata("DC", "publisher", publisher)
 567     output.add_metadata("DC", "date", document.book_info.created_at)
 568
 569     output.guide.append({
 570         "type": "text",
 571         "title": "Początek",
 572         "href": "part1.xhtml"
 573     })
 574
 575     output.add_item(epub.EpubNcx())
 576
 577     spine = output.spine
 578
 579     functions.reg_mathml_epub(output)
 580
 581     # FIXME
 582     for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
 583         url = six.moves.urllib.parse.urljoin(
 584             base_url,
 585             ilustr.get('src')
 586         )
 587         with six.moves.urllib.request.urlopen(url) as imgfile:
 588             img = Image.open(imgfile)
 589
 590         th_format, ext, media_type = {
 591             'GIF': ('GIF', 'gif', 'image/gif'),
 592             'PNG': ('PNG', 'png', 'image/png'),
 593         }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
 594
 595         width = 1200
 596         if img.size[0] < width:
 597             th = img
 598         else:
 599             th = img.resize((width, round(width * img.size[1] / img.size[0])))
 600
 601         buffer = six.BytesIO()
 602         th.save(buffer, format=th_format)
 603
 604         file_name = 'image%d.%s' % (i, ext)
 605         ilustr.set('src', file_name)
 606         output.add_item(
 607             epub.EpubItem(
 608                 uid='image%s' % i,
 609                 file_name=file_name,
 610                 media_type=media_type,
 611                 content=buffer.getvalue()
 612             )
 613         )
 614
 615     # write static elements
 616
 617     with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
 618         output.add_item(
 619             epub.EpubItem(
 620                 uid="logo_wolnelektury.png",
 621                 file_name="logo_wolnelektury.png",
 622                 media_type="image/png",
 623                 content=f.read()
 624             )
 625         )
 626     with open(get_resource('res/jedenprocent.png'), 'rb') as f:
 627         output.add_item(
 628             epub.EpubItem(
 629                 uid="jedenprocent",
 630                 file_name="jedenprocent.png",
 631                 media_type="image/png",
 632                 content=f.read()
 633             )
 634         )
 635
 636     if not style:
 637         style = get_resource('epub/style.css')
 638     with open(style, 'rb') as f:
 639         output.add_item(
 640             epub.EpubItem(
 641                 uid="style",
 642                 file_name="style.css",
 643                 media_type="text/css",
 644                 content=f.read()
 645             )
 646         )
 647
 648     if cover:
 649         if cover is True:
 650             cover = make_cover
 651
 652         cover_file = six.BytesIO()
 653         bound_cover = cover(document.book_info)
 654         bound_cover.save(cover_file)
 655         cover_name = 'cover.%s' % bound_cover.ext()
 656
 657         output.set_cover(
 658             file_name=cover_name,
 659             content=cover_file.getvalue(),
 660         )
 661         spine.append('cover')
 662         output.guide.append({
 663             "type": "cover",
 664             "href": "cover.xhtml",
 665             "title": "Okładka",
 666         })
 667
 668         del cover_file
 669
 670         if bound_cover.uses_dc_cover:
 671             if document.book_info.cover_by:
 672                 document.edoc.getroot().set('data-cover-by',
 673                                             document.book_info.cover_by)
 674             if document.book_info.cover_source:
 675                 document.edoc.getroot().set('data-cover-source',
 676                                             document.book_info.cover_source)
 677
 678     annotations = etree.Element('annotations')
 679
 680     toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
 681     output.toc = toc[0][1]
 682
 683     # Last modifications in container files and EPUB creation
 684     if len(annotations) > 0:
 685         output.toc.append(
 686             epub.Link(
 687                 "annotations.xhtml",
 688                 "Przypisy",
 689                 "annotations"
 690             )
 691         )
 692         replace_by_verse(annotations)
 693         html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
 694         chars = chars.union(used_chars(html_tree.getroot()))
 695
 696         item = epub.EpubItem(
 697             uid="annotations",
 698             file_name="annotations.xhtml",
 699             media_type="application/xhtml+xml",
 700             content=etree.tostring(
 701                 html_tree, pretty_print=True, xml_declaration=True,
 702                 encoding="utf-8",
 703                 doctype='<!DOCTYPE html>'
 704             )
 705         )
 706         output.add_item(item)
 707         spine.append(item)
 708
 709     output.toc.append(
 710         epub.Link(
 711             "support.xhtml",
 712             "Wesprzyj Wolne Lektury",
 713             "support"
 714         )
 715     )
 716     with open(get_resource('epub/support.xhtml'), 'rb') as f:
 717         html_string = f.read()
 718     chars.update(used_chars(etree.fromstring(html_string)))
 719     item = epub.EpubItem(
 720         uid="support",
 721         file_name="support.xhtml",
 722         media_type="application/xhtml+xml",
 723         content=squeeze_whitespace(html_string)
 724     )
 725     output.add_item(item)
 726     spine.append(item)
 727
 728     output.toc.append(
 729         epub.Link(
 730             "last.xhtml",
 731             "Strona redakcyjna",
 732             "last"
 733         )
 734     )
 735     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'),
 736                      outputtype=output_type)
 737     chars.update(used_chars(html_tree.getroot()))
 738     item = epub.EpubItem(
 739         uid="last",
 740         file_name="last.xhtml",
 741         media_type="application/xhtml+xml",
 742         content=squeeze_whitespace(etree.tostring(
 743             html_tree, pretty_print=True, xml_declaration=True,
 744             encoding="utf-8",
 745             doctype='<!DOCTYPE html>'
 746         ))
 747     )
 748     output.add_item(item)
 749     spine.append(item)
 750
 751     if not flags or 'without-fonts' not in flags:
 752         # strip fonts
 753         tmpdir = mkdtemp('-librarian-epub')
 754         try:
 755             cwd = os.getcwd()
 756         except OSError:
 757             cwd = None
 758
 759         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)),
 760                               'font-optimizer'))
 761         for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
 762                       'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
 763             optimizer_call = ['perl', 'subset.pl', '--chars',
 764                               ''.join(chars).encode('utf-8'),
 765                               get_resource('fonts/' + fname),
 766                               os.path.join(tmpdir, fname)]
 767             env = {"PERL_USE_UNSAFE_INC": "1"}
 768             if verbose:
 769                 print("Running font-optimizer")
 770                 subprocess.check_call(optimizer_call, env=env)
 771             else:
 772                 dev_null = open(os.devnull, 'w')
 773                 subprocess.check_call(optimizer_call, stdout=dev_null,
 774                                       stderr=dev_null, env=env)
 775             with open(os.path.join(tmpdir, fname), 'rb') as f:
 776                 output.add_item(
 777                     epub.EpubItem(
 778                         uid=fname,
 779                         file_name=fname,
 780                         media_type="font/ttf",
 781                         content=f.read()
 782                     )
 783                 )
 784         rmtree(tmpdir)
 785         if cwd is not None:
 786             os.chdir(cwd)
 787
 788     remove_empty_lists_from_toc(output.toc)
 789
 790     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub',
 791                                      delete=False)
 792     output_file.close()
 793     epub.write_epub(output_file.name, output, {'epub3_landmark': False})
 794     return OutputFile.from_filename(output_file.name)