src/librarian/builders/epub.py

   1 from datetime import date
   2 import os
   3 import tempfile
   4 from ebooklib import epub
   5 from lxml import etree
   6 import six
   7 from librarian import functions, OutputFile, get_resource, XHTMLNS
   8 from librarian.cover import make_cover
   9 from librarian.embeds.mathml import MathML
  10 import librarian.epub
  11 from librarian.fonts import strip_font
  12
  13
  14
  15
  16 class Xhtml:
  17     def __init__(self):
  18         self.element = etree.XML('''<html xmlns="http://www.w3.org/1999/xhtml"><head><link rel="stylesheet" href="style.css" type="text/css"/><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>WolneLektury.pl</title></head><body/></html>''')
  19
  20     @property
  21     def title(self):
  22         return self.element.find('.//' + XHTMLNS('title'))
  23
  24     @property
  25     def body(self):
  26         return self.element.find('.//' + XHTMLNS('body'))
  27
  28
  29 class Builder:
  30     file_extension = None
  31
  32     def __init__(self, base_url=None, fundraising=None):
  33         self._base_url = base_url or 'file:///home/rczajka/for/fnp/librarian/temp~/maly/img/'
  34         self.fundraising = fundraising
  35         self.footnotes = etree.Element('div', id='footnotes')
  36
  37         self.cursors = {
  38 #            None: None,
  39 #            'header': self.header,
  40             'footnotes': self.footnotes,
  41         }
  42         self.current_cursors = []
  43
  44         self.toc_base = 0
  45
  46     @property
  47     def cursor(self):
  48         return self.current_cursors[-1]
  49
  50     def enter_fragment(self, fragment):
  51         self.current_cursors.append(self.cursors[fragment])
  52
  53     def exit_fragment(self):
  54         self.current_cursors.pop()
  55
  56     def create_fragment(self, name, element):
  57         assert name not in self.cursors
  58         self.cursors[name] = element
  59
  60     def forget_fragment(self, name):
  61         del self.cursors[name]
  62
  63
  64
  65     @property
  66     def base_url(self):
  67         if self._base_url is not None:
  68             return self._base_url
  69         else:
  70             return 'https://wolnelektury.pl/media/book/pictures/{}/'.format(self.document.meta.url.slug)
  71
  72
  73     # Base URL should be on Document level, not builder.
  74     def build(self, document, **kwargs):
  75         """Should return an OutputFile with the output."""
  76         raise NotImplementedError()
  77
  78
  79 class EpubBuilder(Builder):
  80     file_extension = 'epub'
  81
  82     def __init__(self, *args, **kwargs):
  83         self.chars = set()
  84         self.fundr = 0
  85         super().__init__(*args, **kwargs)
  86
  87     def build(self, document, **kwargs):
  88         # replace_characters -- nie, robimy to na poziomie elementów
  89
  90         # hyphenator (\00ad w odp. miejscach) -- jeśli już, to też powinno to się dziać na poziomie elementów
  91         # spójniki (\u00a0 po)-- jeśli już, to na poziomie elementów
  92         # trick na dywizy: &#xad;&#8288;-
  93
  94         # do toc trafia:
  95         #   początek z KAŻDEGO PLIKU xml
  96
  97         # zliczamy zbiór użytych znaków
  98
  99         # flagi:
 100         # mieliśmy taką flagę less-advertising, używaną tylko dla Prestigio; już nie używamy.
 101
 102         # @editors = document.editors() (jako str)
 103         # @funders = join(meta.funders)
 104         # @thanks = meta.thanks
 105
 106
 107         self.output = output = epub.EpubBook()
 108         self.document = document
 109
 110         self.set_metadata()
 111
 112
 113         self.add_cover()
 114
 115         self.add_title_page()
 116         self.add_toc()
 117
 118
 119
 120         self.start_chunk()
 121
 122         self.add_toc_entry(
 123             None,
 124             'Początek utworu', # i18n
 125             0
 126         )
 127         self.output.guide.append({
 128             "type": "text",
 129             "title": "Początek",
 130             "href": "part1.xhtml"
 131         })
 132
 133
 134         self.build_document(self.document)
 135
 136
 137         self.close_chunk()
 138
 139         self.add_annotations()
 140         self.add_support_page()
 141         self.add_last_page()
 142
 143         if self.fundraising:
 144             e = len(self.output.spine) - 3 - 3
 145             nfunds = len(self.fundraising)
 146             if e > 4 * nfunds:
 147                 nfunds *= 2
 148
 149             # COUNTING CHARACTERS?
 150             for f in range(nfunds):
 151                 spine_index = int(4 + (f / nfunds * e) + f)
 152
 153                 h = Xhtml()
 154                 h.body.append(
 155                     etree.XML('<div id="book-text"><div class="fundraising">' + self.fundraising[f % len(self.fundraising)] + '</div></div>')
 156                 )
 157                 self.add_html(h.element, file_name='fund%d.xhtml' % f, spine=spine_index)
 158
 159         self.add_fonts()
 160
 161         output_file = tempfile.NamedTemporaryFile(
 162             prefix='librarian', suffix='.epub',
 163             delete=False)
 164         output_file.close()
 165         epub.write_epub(output_file.name, output, {'epub3_landmark': False})
 166         return OutputFile.from_filename(output_file.name)
 167
 168     def build_document(self, document):
 169         self.toc_precedences = []
 170
 171         self.start_chunk()
 172
 173
 174         document.tree.getroot().epub_build(self)
 175         if document.meta.parts:
 176             self.start_chunk()
 177
 178             self.start_element('div', {'class': 'title-page'})
 179             self.start_element('h1', {'class': 'title'})
 180             self.push_text(document.meta.title)
 181             self.end_element()
 182             self.end_element()
 183
 184             ######
 185             # 160
 186             # translators
 187             # working copy?
 188             # ta lektura
 189             # tanks
 190             # utwor opracowany
 191             # isbn
 192             # logo
 193
 194             for child in document.children:
 195                 self.start_chunk()
 196                 self.add_toc_entry(None, child.meta.title, 0)
 197                 self.build_document(child)
 198
 199         self.shift_toc_base()
 200
 201
 202     def add_title_page(self):
 203         html = Xhtml()
 204         html.title.text = "Strona tytułowa"
 205         bt = etree.SubElement(html.body, 'div', **{'id': 'book-text'})
 206         tp = etree.SubElement(bt, 'div', **{'class': 'title-page'})
 207
 208         # Tak jak jest teraz – czy może być jednocześnie
 209         # no „autor_utworu”
 210         # i „dzieło nadrzędne”
 211         # wcześniej mogło być dzieło nadrzędne,
 212
 213         e = self.document.tree.find('//autor_utworu')
 214         if e is not None:
 215             etree.SubElement(tp, 'h2', **{'class': 'author'}).text = e.raw_printable_text()
 216         e = self.document.tree.find('//nazwa_utworu')
 217         if e is not None:
 218             etree.SubElement(tp, 'h1', **{'class': 'title'}).text = e.raw_printable_text()
 219
 220         if not len(tp):
 221             for author in self.document.meta.authors:
 222                 etree.SubElement(tp, 'h2', **{'class': 'author'}).text = author.readable()
 223             etree.SubElement(tp, 'h1', **{'class': 'title'}).text = self.document.meta.title
 224
 225 #                <xsl:apply-templates select="//nazwa_utworu | //podtytul | //dzielo_nadrzedne" mode="poczatek"/>
 226 #        else:
 227 #                            <xsl:apply-templates select="//dc:creator" mode="poczatek"/>
 228 #                <xsl:apply-templates select="//dc:title | //podtytul | //dzielo_nadrzedne" mode="poczatek"/>
 229
 230         etree.SubElement(tp, 'p', **{"class": "info"}).text = '\u00a0'
 231
 232         if self.document.meta.translators:
 233             p = etree.SubElement(tp, 'p', **{'class': 'info'})
 234             p.text = 'tłum. ' + ', '.join(t.readable() for t in self.document.meta.translators)
 235
 236         #<p class="info">[Kopia robocza]</p>
 237
 238         p = etree.XML("""<p class="info">
 239               <a>Ta lektura</a>, podobnie jak tysiące innych, jest dostępna on-line na stronie
 240               <a href="http://www.wolnelektury.pl/">wolnelektury.pl</a>.
 241             </p>""")
 242         p[0].attrib['href'] = str(self.document.meta.url)
 243         tp.append(p)
 244
 245         if self.document.meta.thanks:
 246             etree.SubElement(tp, 'p', **{'class': 'info'}).text = self.document.meta.thanks
 247
 248         tp.append(etree.XML("""
 249           <p class="info">
 250             Utwór opracowany został w&#160;ramach projektu<a href="http://www.wolnelektury.pl/"> Wolne Lektury</a> przez<a href="http://www.nowoczesnapolska.org.pl/"> fundację Nowoczesna Polska</a>.
 251           </p>
 252         """))
 253
 254         if self.document.meta.isbn_epub:
 255             etree.SubElement(tp, 'p', **{"class": "info"}).text = self.document.meta.isbn_epub
 256
 257         tp.append(etree.XML("""<p class="footer info">
 258             <a href="http://www.wolnelektury.pl/"><img src="logo_wolnelektury.png" alt="WolneLektury.pl" /></a>
 259         </p>"""))
 260
 261         self.add_html(
 262             html.element,
 263             file_name='title.xhtml',
 264             spine=True,
 265             toc='Strona tytułowa' # TODO: i18n
 266         )
 267
 268         self.add_file(
 269             get_resource('res/wl-logo-small.png'),
 270             file_name='logo_wolnelektury.png',
 271             media_type='image/png'
 272         )
 273
 274     def set_metadata(self):
 275         self.output.set_identifier(
 276             str(self.document.meta.url))
 277         self.output.set_language(
 278             functions.lang_code_3to2(self.document.meta.language)
 279         )
 280         self.output.set_title(self.document.meta.title)
 281
 282         for i, author in enumerate(self.document.meta.authors):
 283             self.output.add_author(
 284                 author.readable(),
 285                 file_as=six.text_type(author),
 286                 uid='creator{}'.format(i)
 287             )
 288         for translator in self.document.meta.translators:
 289             self.output.add_author(
 290                 translator.readable(),
 291                 file_as=six.text_type(translator),
 292                 role='trl',
 293                 uid='translator{}'.format(i)
 294             )
 295         for publisher in self.document.meta.publisher:
 296             self.output.add_metadata("DC", "publisher", publisher)
 297
 298         self.output.add_metadata("DC", "date", self.document.meta.created_at)
 299
 300
 301
 302
 303     def add_toc(self):
 304         item = epub.EpubNav()
 305         self.output.add_item(item)
 306         self.output.spine.append(item)
 307         self.output.add_item(epub.EpubNcx())
 308
 309         self.output.toc.append(
 310             epub.Link(
 311                 "nav.xhtml",
 312                 "Spis treści",
 313                 "nav"
 314             )
 315         )
 316
 317
 318
 319     def add_support_page(self):
 320         self.add_file(
 321             get_resource('epub/support.xhtml'),
 322             spine=True,
 323             toc='Wesprzyj Wolne Lektury'
 324         )
 325
 326         self.add_file(
 327             get_resource('res/jedenprocent.png'),
 328             media_type='image/png'
 329         )
 330         self.add_file(
 331             get_resource('epub/style.css'),
 332             media_type='text/css'
 333         )
 334
 335
 336     def add_file(self, path=None, content=None,
 337                  media_type='application/xhtml+xml',
 338                  file_name=None, uid=None,
 339                  spine=False, toc=None):
 340
 341         # update chars?
 342         # jakieś tam ścieśnianie białych znaków?
 343
 344         if content is None:
 345             with open(path, 'rb') as f:
 346                 content = f.read()
 347             if file_name is None:
 348                 file_name = path.rsplit('/', 1)[-1]
 349
 350         if uid is None:
 351             uid = file_name.split('.', 1)[0]
 352
 353         item = epub.EpubItem(
 354             uid=uid,
 355             file_name=file_name,
 356             media_type=media_type,
 357             content=content
 358         )
 359
 360         self.output.add_item(item)
 361         if spine:
 362             if spine is True:
 363                 self.output.spine.append(item)
 364             else:
 365                 self.output.spine.insert(spine, item)
 366
 367         if toc:
 368             self.output.toc.append(
 369                 epub.Link(
 370                     file_name,
 371                     toc,
 372                     uid
 373                 )
 374             )
 375
 376     def add_html(self, html_tree, **kwargs):
 377         html = etree.tostring(
 378             html_tree, pretty_print=True, xml_declaration=True,
 379             encoding="utf-8",
 380             doctype='<!DOCTYPE html>'
 381         )
 382
 383         html = librarian.epub.squeeze_whitespace(html)
 384
 385         self.add_file(
 386             content=html,
 387             **kwargs
 388         )
 389
 390
 391     def add_fonts(self):
 392         for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
 393                       'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
 394             self.add_file(
 395                 content=strip_font(
 396                     get_resource('fonts/' + fname),
 397                     self.chars
 398                 ),
 399                 file_name=fname,
 400                 media_type='font/ttf'
 401             )
 402
 403     def start_chunk(self):
 404         if getattr(self, 'current_chunk', None) is not None:
 405             if not len(self.current_chunk):
 406                 return
 407             self.close_chunk()
 408         self.current_chunk = etree.Element(
 409             'div',
 410             id="book-text"
 411         )
 412         self.cursors[None] = self.current_chunk
 413         self.current_cursors.append(self.current_chunk)
 414
 415         self.section_number = 0
 416
 417
 418     def close_chunk(self):
 419         assert self.cursor is self.current_chunk
 420         ###### -- what if we're inside?
 421
 422         chunk_no = getattr(
 423             self,
 424             'chunk_counter',
 425             1
 426         )
 427         self.chunk_counter = chunk_no + 1
 428
 429         html = Xhtml()
 430         html.body.append(self.current_chunk)
 431
 432         self.add_html(
 433             ## html container from template.
 434             #self.current_chunk,
 435             html.element,
 436             file_name='part%d.xhtml' % chunk_no,
 437             spine=True,
 438
 439         )
 440         self.current_chunk = None
 441         self.current_cursors.pop()
 442
 443     def start_element(self, tag, attr):
 444         self.current_cursors.append(
 445             etree.SubElement(self.cursor, tag, **attr)
 446         )
 447
 448     def end_element(self):
 449         self.current_cursors.pop()
 450
 451     def push_text(self, text):
 452         self.chars.update(text)
 453         if len(self.cursor):
 454             self.cursor[-1].tail = (self.cursor[-1].tail or '') + text
 455         else:
 456             self.cursor.text = (self.cursor.text or '') + text
 457
 458
 459     def assign_image_number(self):
 460         image_number = getattr(self, 'image_number', 0)
 461         self.image_number = image_number + 1
 462         return image_number
 463
 464     def assign_footnote_number(self):
 465         number = getattr(self, 'footnote_number', 1)
 466         self.footnote_number = number + 1
 467         return number
 468
 469     def assign_section_number(self):
 470         number = getattr(self, 'section_number', 1)
 471         self.section_number = number + 1
 472         return number
 473
 474     def assign_mathml_number(self):
 475         number = getattr(self, 'mathml_number', 0)
 476         self.mathml_number = number + 1
 477         return number
 478
 479
 480     def add_toc_entry(self, fragment, name, precedence):
 481         if precedence:
 482             while self.toc_precedences and self.toc_precedences[-1] >= precedence:
 483                 self.toc_precedences.pop()
 484         else:
 485             self.toc_precedences = []
 486
 487         real_level = self.toc_base + len(self.toc_precedences)
 488         if precedence:
 489             self.toc_precedences.append(precedence)
 490         else:
 491             self.toc_base += 1
 492
 493         part_number = getattr(
 494             self,
 495             'chunk_counter',
 496             1
 497         )
 498         filename = 'part%d.xhtml' % part_number
 499         uid = filename.split('.')[0]
 500         if fragment:
 501             filename += '#' + fragment
 502             uid += '-' + fragment
 503
 504         toc = self.output.toc
 505         for l in range(1, real_level):
 506             if isinstance(toc[-1], epub.Link):
 507                 toc[-1] = [toc[-1], []]
 508             toc = toc[-1][1]
 509
 510         toc.append(
 511             epub.Link(
 512                 filename,
 513                 name,
 514                 uid
 515             )
 516         )
 517
 518     def shift_toc_base(self):
 519         self.toc_base -= 1
 520
 521
 522     def add_last_page(self):
 523         html = Xhtml()
 524         m = self.document.meta
 525
 526         html.title.text = 'Strona redakcyjna'
 527         d = etree.SubElement(html.body, 'div', id='book-text')
 528
 529         newp = lambda: etree.SubElement(d, 'p', {'class': 'info'})
 530
 531         p = newp()
 532         if m.license:
 533             p.text = """
 534                       Ten utwór jest udostępniony na licencji
 535                       """
 536             etree.SubElement(p, 'a', href=m.license).text = m.license_description
 537         else:
 538             p.text = """
 539                     Ten utwór nie jest objęty majątkowym prawem autorskim i znajduje się w domenie
 540                     publicznej, co oznacza że możesz go swobodnie wykorzystywać, publikować
 541                     i rozpowszechniać. Jeśli utwór opatrzony jest dodatkowymi materiałami
 542                     (przypisy, motywy literackie etc.), które podlegają prawu autorskiemu, to
 543                     te dodatkowe materiały udostępnione są na licencji
 544                     """
 545             a = etree.SubElement(p, "a", href="http://creativecommons.org/licenses/by-sa/3.0/")
 546             a.text = """Creative Commons
 547                     Uznanie Autorstwa – Na Tych Samych Warunkach 3.0 PL"""
 548             a.tail = "."
 549
 550
 551         p = newp()
 552         p.text = 'Źródło: '
 553         etree.SubElement(
 554             p, 'a', href=str(m.url),
 555             title=', '.join((
 556                 ', '.join(p.readable() for p in m.authors),
 557                 m.title
 558             ))
 559         ).text = str(m.url)
 560
 561         if m.source_name:
 562             newp().text = 'Tekst opracowany na podstawie: ' + m.source_name
 563
 564         newp().text = """
 565               Wydawca:
 566               """ + ", ".join(p for p in m.publisher)
 567
 568         if m.description:
 569             newp().text = m.description
 570
 571
 572         if m.editors:
 573             newp().text = 'Opracowanie redakcyjne i przypisy: %s.' % (
 574                 ', '.join(e.readable() for e in sorted(self.document.editors())))
 575
 576         if m.funders:
 577             etree.SubElement(d, 'p', {'class': 'minor-info'}).text = '''Publikację wsparli i wsparły:
 578             %s.''' % (', '.join(m.funders))
 579
 580         if m.cover_by:
 581             p = newp()
 582             p.text = 'Okładka na podstawie: '
 583             if m.cover_source:
 584                 etree.SubElement(
 585                     p,
 586                     'a',
 587                     href=m.cover_source
 588                 ).text = m.cover_by
 589             else:
 590                 p.text += m.cover_by
 591
 592         if m.isbn_epub:
 593             newp().text = m.isbn_epub
 594
 595         newp().text = '\u00a0'
 596
 597         p = newp()
 598         p.attrib['class'] = 'minor-info'
 599         p.text = '''
 600               Plik wygenerowany dnia '''
 601         span = etree.SubElement(p, 'span', id='file_date')
 602         span.text = str(date.today())
 603         span.tail = '''.
 604           '''
 605
 606         self.add_html(
 607             html.element,
 608             file_name='last.xhtml',
 609             toc='Strona redakcyjna',
 610             spine=True
 611         )
 612
 613
 614     def add_annotations(self):
 615         if not len(self.footnotes):
 616             return
 617
 618         html = Xhtml()
 619         html.title.text = 'Przypisy'
 620         d = etree.SubElement(
 621             etree.SubElement(
 622                 html.body,
 623                 'div',
 624                 id='book-text'
 625             ),
 626             'div',
 627             id='footnotes'
 628         )
 629
 630         etree.SubElement(
 631             d,
 632             'h2',
 633         ).text = 'Przypisy:'
 634
 635         d.extend(self.footnotes)
 636
 637         self.add_html(
 638             html.element,
 639             file_name='annotations.xhtml',
 640             spine=True,
 641             toc='Przypisy'
 642         )
 643
 644     def add_cover(self):
 645         # TODO: allow other covers
 646
 647         cover_maker = make_cover
 648
 649         cover_file = six.BytesIO()
 650         cover = cover_maker(self.document.meta)
 651         cover.save(cover_file)
 652         cover_name = 'cover.%s' % cover.ext()
 653
 654         self.output.set_cover(
 655             file_name=cover_name,
 656             content=cover_file.getvalue(),
 657             create_page = False
 658         )
 659         ci = ('''<?xml version="1.0" encoding="UTF-8"?>
 660 <!DOCTYPE html>
 661 <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
 662  <head>
 663   <title>Okładka</title>
 664   <style>
 665     body { margin: 0em; padding: 0em; }
 666     img { width: 100%%; }
 667   </style>
 668  </head>
 669  <body>
 670    <img src="cover.%s" alt="Okładka" />
 671  </body>
 672 </html>''' % cover.ext()).encode('utf-8')
 673         self.add_file(file_name='cover.xhtml', content=ci)
 674
 675         self.output.spine.append(('cover', 'no'))
 676         self.output.guide.append({
 677             'type': 'cover',
 678             'href': 'cover.xhtml',
 679             'title': 'Okładka'
 680         })
 681
 682     def mathml(self, element):
 683         name = "math%d.png" % self.assign_mathml_number()
 684         self.add_file(
 685             content=MathML(element).to_latex().to_png().data,
 686             media_type='image/png',
 687             file_name=name
 688         )
 689         return name