src/librarian/builders/epub.py

   1 from datetime import date
   2 import os
   3 import tempfile
   4 from ebooklib import epub
   5 from lxml import etree
   6 import six
   7 from librarian import functions, OutputFile, get_resource, XHTMLNS
   8 from librarian.cover import make_cover
   9 from librarian.embeds.mathml import MathML
  10 import librarian.epub
  11 from librarian.fonts import strip_font
  12
  13
  14 class Xhtml:
  15     def __init__(self):
  16         self.element = etree.XML('''<html xmlns="http://www.w3.org/1999/xhtml"><head><link rel="stylesheet" href="style.css" type="text/css"/><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>WolneLektury.pl</title></head><body/></html>''')
  17
  18     @property
  19     def title(self):
  20         return self.element.find('.//' + XHTMLNS('title'))
  21
  22     @property
  23     def body(self):
  24         return self.element.find('.//' + XHTMLNS('body'))
  25
  26
  27 class Builder:
  28     file_extension = None
  29
  30     def __init__(self, base_url=None, fundraising=None, cover=None):
  31         self._base_url = base_url or 'file:///home/rczajka/for/fnp/librarian/temp~/maly/img/'
  32         self.fundraising = fundraising
  33         self.footnotes = etree.Element('div', id='footnotes')
  34         self.make_cover = cover or make_cover
  35
  36         self.cursors = {
  37 #            None: None,
  38 #            'header': self.header,
  39             'footnotes': self.footnotes,
  40         }
  41         self.current_cursors = []
  42
  43         self.toc_base = 0
  44
  45     @property
  46     def cursor(self):
  47         return self.current_cursors[-1]
  48
  49     def enter_fragment(self, fragment):
  50         self.current_cursors.append(self.cursors[fragment])
  51
  52     def exit_fragment(self):
  53         self.current_cursors.pop()
  54
  55     def create_fragment(self, name, element):
  56         assert name not in self.cursors
  57         self.cursors[name] = element
  58
  59     def forget_fragment(self, name):
  60         del self.cursors[name]
  61
  62     @property
  63     def base_url(self):
  64         if self._base_url is not None:
  65             return self._base_url
  66         else:
  67             return 'https://wolnelektury.pl/media/book/pictures/{}/'.format(self.document.meta.url.slug)
  68
  69
  70     # Base URL should be on Document level, not builder.
  71     def build(self, document, **kwargs):
  72         """Should return an OutputFile with the output."""
  73         raise NotImplementedError()
  74
  75
  76 class EpubBuilder(Builder):
  77     file_extension = 'epub'
  78     isbn_field = 'isbn_epub'
  79     orphans = True
  80
  81     def __init__(self, *args, **kwargs):
  82         self.chars = set()
  83         self.fundr = 0
  84         super().__init__(*args, **kwargs)
  85
  86     def build(self, document, **kwargs):
  87         # replace_characters -- nie, robimy to na poziomie elementów
  88
  89         # hyphenator (\00ad w odp. miejscach) -- jeśli już, to też powinno to się dziać na poziomie elementów
  90         # spójniki (\u00a0 po)-- jeśli już, to na poziomie elementów
  91         # trick na dywizy: &#xad;&#8288;-
  92
  93         # do toc trafia:
  94         #   początek z KAŻDEGO PLIKU xml
  95
  96         # zliczamy zbiór użytych znaków
  97
  98         # flagi:
  99         # mieliśmy taką flagę less-advertising, używaną tylko dla Prestigio; już nie używamy.
 100
 101         # @editors = document.editors() (jako str)
 102         # @funders = join(meta.funders)
 103         # @thanks = meta.thanks
 104
 105
 106         self.output = output = epub.EpubBook()
 107         self.document = document
 108
 109         self.set_metadata()
 110
 111         self.add_cover()
 112
 113         self.add_title_page()
 114         self.add_toc()
 115
 116
 117
 118         self.start_chunk()
 119
 120         self.add_toc_entry(
 121             None,
 122             'Początek utworu', # i18n
 123             0
 124         )
 125         self.output.guide.append({
 126             "type": "text",
 127             "title": "Początek",
 128             "href": "part1.xhtml"
 129         })
 130
 131
 132         self.build_document(self.document)
 133
 134
 135         self.close_chunk()
 136
 137         self.add_annotations()
 138         self.add_support_page()
 139         self.add_last_page()
 140
 141         if self.fundraising:
 142             e = len(self.output.spine) - 3 - 3
 143             nfunds = len(self.fundraising)
 144             if e > 4 * nfunds:
 145                 nfunds *= 2
 146
 147             # COUNTING CHARACTERS?
 148             for f in range(nfunds):
 149                 spine_index = int(4 + (f / nfunds * e) + f)
 150
 151                 h = Xhtml()
 152                 h.body.append(
 153                     etree.XML('<div id="book-text"><div class="fundraising">' + self.fundraising[f % len(self.fundraising)] + '</div></div>')
 154                 )
 155                 self.add_html(h.element, file_name='fund%d.xhtml' % f, spine=spine_index)
 156
 157         self.add_fonts()
 158
 159         output_file = tempfile.NamedTemporaryFile(
 160             prefix='librarian', suffix='.epub',
 161             delete=False)
 162         output_file.close()
 163         epub.write_epub(output_file.name, output, {'epub3_landmark': False})
 164         return OutputFile.from_filename(output_file.name)
 165
 166     def build_document(self, document):
 167         self.toc_precedences = []
 168
 169         self.start_chunk()
 170
 171
 172         document.tree.getroot().epub_build(self)
 173         if document.meta.parts:
 174             self.start_chunk()
 175
 176             self.start_element('div', {'class': 'title-page'})
 177             self.start_element('h1', {'class': 'title'})
 178             self.push_text(document.meta.title)
 179             self.end_element()
 180             self.end_element()
 181
 182             ######
 183             # 160
 184             # translators
 185             # working copy?
 186             # ta lektura
 187             # tanks
 188             # utwor opracowany
 189             # isbn
 190             # logo
 191
 192             for child in document.children:
 193                 self.start_chunk()
 194                 self.add_toc_entry(None, child.meta.title, 0)
 195                 self.build_document(child)
 196
 197         self.shift_toc_base()
 198
 199
 200     def add_title_page(self):
 201         html = Xhtml()
 202         html.title.text = "Strona tytułowa"
 203         bt = etree.SubElement(html.body, 'div', **{'id': 'book-text'})
 204         tp = etree.SubElement(bt, 'div', **{'class': 'title-page'})
 205
 206         # Tak jak jest teraz – czy może być jednocześnie
 207         # no „autor_utworu”
 208         # i „dzieło nadrzędne”
 209         # wcześniej mogło być dzieło nadrzędne,
 210
 211         e = self.document.tree.find('//autor_utworu')
 212         if e is not None:
 213             etree.SubElement(tp, 'h2', **{'class': 'author'}).text = e.raw_printable_text(self)
 214         e = self.document.tree.find('//nazwa_utworu')
 215         if e is not None:
 216             etree.SubElement(tp, 'h1', **{'class': 'title'}).text = e.raw_printable_text(self)
 217
 218         if not len(tp):
 219             for author in self.document.meta.authors:
 220                 etree.SubElement(tp, 'h2', **{'class': 'author'}).text = author.readable()
 221             etree.SubElement(tp, 'h1', **{'class': 'title'}).text = self.document.meta.title
 222
 223 #                <xsl:apply-templates select="//nazwa_utworu | //podtytul | //dzielo_nadrzedne" mode="poczatek"/>
 224 #        else:
 225 #                            <xsl:apply-templates select="//dc:creator" mode="poczatek"/>
 226 #                <xsl:apply-templates select="//dc:title | //podtytul | //dzielo_nadrzedne" mode="poczatek"/>
 227
 228         etree.SubElement(tp, 'p', **{"class": "info"}).text = '\u00a0'
 229
 230         if self.document.meta.translators:
 231             p = etree.SubElement(tp, 'p', **{'class': 'info'})
 232             p.text = 'tłum. ' + ', '.join(t.readable() for t in self.document.meta.translators)
 233
 234         #<p class="info">[Kopia robocza]</p>
 235
 236         p = etree.XML("""<p class="info">
 237               <a>Ta lektura</a>, podobnie jak tysiące innych, jest dostępna on-line na stronie
 238               <a href="http://www.wolnelektury.pl/">wolnelektury.pl</a>.
 239             </p>""")
 240         p[0].attrib['href'] = str(self.document.meta.url)
 241         tp.append(p)
 242
 243         if self.document.meta.thanks:
 244             etree.SubElement(tp, 'p', **{'class': 'info'}).text = self.document.meta.thanks
 245
 246         tp.append(etree.XML("""
 247           <p class="info">
 248             Utwór opracowany został w&#160;ramach projektu<a href="http://www.wolnelektury.pl/"> Wolne Lektury</a> przez<a href="http://www.nowoczesnapolska.org.pl/"> fundację Nowoczesna Polska</a>.
 249           </p>
 250         """))
 251
 252         if getattr(self.document.meta, self.isbn_field):
 253             etree.SubElement(tp, 'p', **{"class": "info"}).text = getattr(self.document.meta, self.isbn_field)
 254
 255         tp.append(etree.XML("""<p class="footer info">
 256             <a href="http://www.wolnelektury.pl/"><img src="logo_wolnelektury.png" alt="WolneLektury.pl" /></a>
 257         </p>"""))
 258
 259         self.add_html(
 260             html.element,
 261             file_name='title.xhtml',
 262             spine=True,
 263             toc='Strona tytułowa' # TODO: i18n
 264         )
 265
 266         self.add_file(
 267             get_resource('res/wl-logo-small.png'),
 268             file_name='logo_wolnelektury.png',
 269             media_type='image/png'
 270         )
 271
 272     def set_metadata(self):
 273         self.output.set_identifier(
 274             str(self.document.meta.url))
 275         self.output.set_language(
 276             functions.lang_code_3to2(self.document.meta.language)
 277         )
 278         self.output.set_title(self.document.meta.title)
 279
 280         for i, author in enumerate(self.document.meta.authors):
 281             self.output.add_author(
 282                 author.readable(),
 283                 file_as=six.text_type(author),
 284                 uid='creator{}'.format(i)
 285             )
 286         for translator in self.document.meta.translators:
 287             self.output.add_author(
 288                 translator.readable(),
 289                 file_as=six.text_type(translator),
 290                 role='trl',
 291                 uid='translator{}'.format(i)
 292             )
 293         for publisher in self.document.meta.publisher:
 294             self.output.add_metadata("DC", "publisher", publisher)
 295
 296         self.output.add_metadata("DC", "date", self.document.meta.created_at)
 297
 298
 299
 300
 301     def add_toc(self):
 302         item = epub.EpubNav()
 303         item.add_link(href='style.css', rel='stylesheet', type='text/css')
 304         self.output.add_item(item)
 305         self.output.spine.append(item)
 306         self.output.add_item(epub.EpubNcx())
 307
 308         self.output.toc.append(
 309             epub.Link(
 310                 "nav.xhtml",
 311                 "Spis treści",
 312                 "nav"
 313             )
 314         )
 315
 316
 317
 318     def add_support_page(self):
 319         self.add_file(
 320             get_resource('epub/support.xhtml'),
 321             spine=True,
 322             toc='Wesprzyj Wolne Lektury'
 323         )
 324
 325         self.add_file(
 326             get_resource('res/jedenprocent.png'),
 327             media_type='image/png'
 328         )
 329         self.add_file(
 330             get_resource('epub/style.css'),
 331             media_type='text/css'
 332         )
 333
 334
 335     def add_file(self, path=None, content=None,
 336                  media_type='application/xhtml+xml',
 337                  file_name=None, uid=None,
 338                  spine=False, toc=None):
 339
 340         # update chars?
 341         # jakieś tam ścieśnianie białych znaków?
 342
 343         if content is None:
 344             with open(path, 'rb') as f:
 345                 content = f.read()
 346             if file_name is None:
 347                 file_name = path.rsplit('/', 1)[-1]
 348
 349         if uid is None:
 350             uid = file_name.split('.', 1)[0]
 351
 352         item = epub.EpubItem(
 353             uid=uid,
 354             file_name=file_name,
 355             media_type=media_type,
 356             content=content
 357         )
 358
 359         self.output.add_item(item)
 360         if spine:
 361             if spine is True:
 362                 self.output.spine.append(item)
 363             else:
 364                 self.output.spine.insert(spine, item)
 365
 366         if toc:
 367             self.output.toc.append(
 368                 epub.Link(
 369                     file_name,
 370                     toc,
 371                     uid
 372                 )
 373             )
 374
 375     def add_html(self, html_tree, **kwargs):
 376         html = etree.tostring(
 377             html_tree, pretty_print=True, xml_declaration=True,
 378             encoding="utf-8",
 379             doctype='<!DOCTYPE html>'
 380         )
 381
 382         html = librarian.epub.squeeze_whitespace(html)
 383
 384         self.add_file(
 385             content=html,
 386             **kwargs
 387         )
 388
 389
 390     def add_fonts(self):
 391         for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
 392                       'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
 393             self.add_file(
 394                 content=strip_font(
 395                     get_resource('fonts/' + fname),
 396                     self.chars
 397                 ),
 398                 file_name=fname,
 399                 media_type='font/ttf'
 400             )
 401
 402     def start_chunk(self):
 403         if getattr(self, 'current_chunk', None) is not None:
 404             if not len(self.current_chunk):
 405                 return
 406             self.close_chunk()
 407         self.current_chunk = etree.Element(
 408             'div',
 409             id="book-text"
 410         )
 411         self.cursors[None] = self.current_chunk
 412         self.current_cursors.append(self.current_chunk)
 413
 414         self.section_number = 0
 415
 416
 417     def close_chunk(self):
 418         assert self.cursor is self.current_chunk
 419         ###### -- what if we're inside?
 420
 421         chunk_no = getattr(
 422             self,
 423             'chunk_counter',
 424             1
 425         )
 426         self.chunk_counter = chunk_no + 1
 427
 428         html = Xhtml()
 429         html.body.append(self.current_chunk)
 430
 431         self.add_html(
 432             ## html container from template.
 433             #self.current_chunk,
 434             html.element,
 435             file_name='part%d.xhtml' % chunk_no,
 436             spine=True,
 437
 438         )
 439         self.current_chunk = None
 440         self.current_cursors.pop()
 441
 442     def start_element(self, tag, attr):
 443         self.current_cursors.append(
 444             etree.SubElement(self.cursor, tag, **attr)
 445         )
 446
 447     def end_element(self):
 448         self.current_cursors.pop()
 449
 450     def push_text(self, text):
 451         self.chars.update(text)
 452         if len(self.cursor):
 453             self.cursor[-1].tail = (self.cursor[-1].tail or '') + text
 454         else:
 455             self.cursor.text = (self.cursor.text or '') + text
 456
 457
 458     def assign_image_number(self):
 459         image_number = getattr(self, 'image_number', 0)
 460         self.image_number = image_number + 1
 461         return image_number
 462
 463     def assign_footnote_number(self):
 464         number = getattr(self, 'footnote_number', 1)
 465         self.footnote_number = number + 1
 466         return number
 467
 468     def assign_section_number(self):
 469         number = getattr(self, 'section_number', 1)
 470         self.section_number = number + 1
 471         return number
 472
 473     def assign_mathml_number(self):
 474         number = getattr(self, 'mathml_number', 0)
 475         self.mathml_number = number + 1
 476         return number
 477
 478
 479     def add_toc_entry(self, fragment, name, precedence):
 480         if precedence:
 481             while self.toc_precedences and self.toc_precedences[-1] >= precedence:
 482                 self.toc_precedences.pop()
 483         else:
 484             self.toc_precedences = []
 485
 486         real_level = self.toc_base + len(self.toc_precedences)
 487         if precedence:
 488             self.toc_precedences.append(precedence)
 489         else:
 490             self.toc_base += 1
 491
 492         part_number = getattr(
 493             self,
 494             'chunk_counter',
 495             1
 496         )
 497         filename = 'part%d.xhtml' % part_number
 498         uid = filename.split('.')[0]
 499         if fragment:
 500             filename += '#' + fragment
 501             uid += '-' + fragment
 502
 503         toc = self.output.toc
 504         for l in range(1, real_level):
 505             if isinstance(toc[-1], epub.Link):
 506                 toc[-1] = [toc[-1], []]
 507             toc = toc[-1][1]
 508
 509         toc.append(
 510             epub.Link(
 511                 filename,
 512                 name,
 513                 uid
 514             )
 515         )
 516
 517     def shift_toc_base(self):
 518         self.toc_base -= 1
 519
 520
 521     def add_last_page(self):
 522         html = Xhtml()
 523         m = self.document.meta
 524
 525         html.title.text = 'Strona redakcyjna'
 526         d = etree.SubElement(html.body, 'div', id='book-text')
 527
 528         newp = lambda: etree.SubElement(d, 'p', {'class': 'info'})
 529
 530         p = newp()
 531         p.text = (
 532             "Wszystkie zasoby Wolnych Lektur możesz swobodnie wykorzystywać, "
 533             "publikować i rozpowszechniać pod warunkiem zachowania warunków "
 534             "licencji i zgodnie z "
 535         )
 536         a = etree.SubElement(p, "a", href="https://wolnelektury.pl/info/zasady-wykorzystania/")
 537         a.text = "Zasadami wykorzystania Wolnych Lektur"
 538         a.tail = "."
 539
 540         etree.SubElement(p, "br")
 541
 542
 543         if m.license:
 544             p[-1].tail = "Ten utwór jest udostępniony na licencji "
 545             etree.SubElement(p, 'a', href=m.license).text = m.license_description
 546         else:
 547             p[-1].tail = 'Ten utwór jest w domenie publicznej.'
 548
 549         etree.SubElement(p, "br")
 550
 551         p[-1].tail = (
 552             "Wszystkie materiały dodatkowe (przypisy, motywy literackie) są "
 553             "udostępnione na "
 554             )
 555         etree.SubElement(p, 'a', href='https://artlibre.org/licence/lal/pl/').text = 'Licencji Wolnej Sztuki 1.3'
 556         p[-1].tail = '.'
 557         etree.SubElement(p, "br")
 558         p[-1].tail = (
 559             "Fundacja Nowoczesna Polska zastrzega sobie prawa do wydania "
 560             "krytycznego zgodnie z art. Art.99(2) Ustawy o prawach autorskich "
 561             "i prawach pokrewnych. Wykorzystując zasoby z Wolnych Lektur, "
 562             "należy pamiętać o zapisach licencji oraz zasadach, które "
 563             "spisaliśmy w "
 564         )
 565
 566         etree.SubElement(p, 'a', href='https://wolnelektury.pl/info/zasady-wykorzystania/').text = 'Zasadach wykorzystania Wolnych Lektur'
 567         p[-1].tail = '. Zapoznaj się z nimi, zanim udostępnisz dalej nasze książki.'
 568
 569         p = newp()
 570         p.text = 'E-book można pobrać ze strony: '
 571         etree.SubElement(
 572             p, 'a', href=str(m.url),
 573             title=', '.join((
 574                 ', '.join(p.readable() for p in m.authors),
 575                 m.title
 576             ))
 577         ).text = str(m.url)
 578
 579         if m.source_name:
 580             newp().text = 'Tekst opracowany na podstawie: ' + m.source_name
 581
 582         newp().text = """
 583               Wydawca:
 584               """ + ", ".join(p for p in m.publisher)
 585
 586         if m.description:
 587             newp().text = m.description
 588
 589
 590         if m.editors:
 591             newp().text = 'Opracowanie redakcyjne i przypisy: %s.' % (
 592                 ', '.join(e.readable() for e in sorted(self.document.editors())))
 593
 594         if m.funders:
 595             etree.SubElement(d, 'p', {'class': 'minor-info'}).text = '''Publikację wsparli i wsparły:
 596             %s.''' % (', '.join(m.funders))
 597
 598         if m.cover_by:
 599             p = newp()
 600             p.text = 'Okładka na podstawie: '
 601             if m.cover_source:
 602                 etree.SubElement(
 603                     p,
 604                     'a',
 605                     href=m.cover_source
 606                 ).text = m.cover_by
 607             else:
 608                 p.text += m.cover_by
 609
 610         if getattr(m, self.isbn_field):
 611             newp().text = getattr(m, self.isbn_field)
 612
 613         newp().text = '\u00a0'
 614
 615         p = newp()
 616         p.attrib['class'] = 'minor-info'
 617         p.text = '''
 618               Plik wygenerowany dnia '''
 619         span = etree.SubElement(p, 'span', id='file_date')
 620         span.text = str(date.today())
 621         span.tail = '''.
 622           '''
 623
 624         self.add_html(
 625             html.element,
 626             file_name='last.xhtml',
 627             toc='Strona redakcyjna',
 628             spine=True
 629         )
 630
 631
 632     def add_annotations(self):
 633         if not len(self.footnotes):
 634             return
 635
 636         html = Xhtml()
 637         html.title.text = 'Przypisy'
 638         d = etree.SubElement(
 639             etree.SubElement(
 640                 html.body,
 641                 'div',
 642                 id='book-text'
 643             ),
 644             'div',
 645             id='footnotes'
 646         )
 647
 648         etree.SubElement(
 649             d,
 650             'h2',
 651         ).text = 'Przypisy:'
 652
 653         d.extend(self.footnotes)
 654
 655         self.add_html(
 656             html.element,
 657             file_name='annotations.xhtml',
 658             spine=True,
 659             toc='Przypisy'
 660         )
 661
 662     def add_cover(self):
 663         # TODO: allow other covers
 664
 665         cover_maker = self.make_cover
 666
 667         cover_file = six.BytesIO()
 668         cover = cover_maker(self.document.meta, width=600)
 669         cover.save(cover_file)
 670         cover_name = 'cover.%s' % cover.ext()
 671
 672         self.output.set_cover(
 673             file_name=cover_name,
 674             content=cover_file.getvalue(),
 675             create_page = False
 676         )
 677         ci = ('''<?xml version="1.0" encoding="UTF-8"?>
 678 <!DOCTYPE html>
 679 <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
 680  <head>
 681   <title>Okładka</title>
 682   <style>
 683     body { margin: 0em; padding: 0em; }
 684     img { width: 100%%; }
 685   </style>
 686  </head>
 687  <body>
 688    <img src="cover.%s" alt="Okładka" />
 689  </body>
 690 </html>''' % cover.ext()).encode('utf-8')
 691         self.add_file(file_name='cover.xhtml', content=ci)
 692
 693         self.output.spine.append(('cover', 'no'))
 694         self.output.guide.append({
 695             'type': 'cover',
 696             'href': 'cover.xhtml',
 697             'title': 'Okładka'
 698         })
 699
 700     def mathml(self, element):
 701         name = "math%d.png" % self.assign_mathml_number()
 702         self.add_file(
 703             content=MathML(element).to_latex().to_png().data,
 704             media_type='image/png',
 705             file_name=name
 706         )
 707         return name