src/librarian/builders/epub.py

   1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
   3 #
   4 from datetime import date
   5 import io
   6 import os
   7 import re
   8 import tempfile
   9 from ebooklib import epub
  10 from lxml import etree
  11 from librarian import functions, OutputFile, get_resource, XHTMLNS
  12 from librarian.cover import make_cover
  13 from librarian.embeds.mathml import MathML
  14 from librarian.fonts import strip_font
  15
  16
  17 class Xhtml:
  18     def __init__(self):
  19         self.element = etree.XML('''<html xmlns="http://www.w3.org/1999/xhtml"><head><link rel="stylesheet" href="style.css" type="text/css"/><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>WolneLektury.pl</title></head><body/></html>''')
  20
  21     @property
  22     def title(self):
  23         return self.element.find('.//' + XHTMLNS('title'))
  24
  25     @property
  26     def body(self):
  27         return self.element.find('.//' + XHTMLNS('body'))
  28
  29
  30 class Builder:
  31     file_extension = None
  32
  33     def __init__(self, base_url=None, fundraising=None, cover=None):
  34         self._base_url = base_url or 'file:///home/rczajka/for/fnp/librarian/temp~/maly/img/'
  35         self.fundraising = fundraising
  36         self.footnotes = etree.Element('div', id='footnotes')
  37         self.make_cover = cover or make_cover
  38
  39         self.cursors = {
  40 #            None: None,
  41 #            'header': self.header,
  42             'footnotes': self.footnotes,
  43         }
  44         self.current_cursors = []
  45
  46         self.toc_base = 0
  47
  48     @property
  49     def cursor(self):
  50         return self.current_cursors[-1]
  51
  52     def enter_fragment(self, fragment):
  53         self.current_cursors.append(self.cursors[fragment])
  54
  55     def exit_fragment(self):
  56         self.current_cursors.pop()
  57
  58     def create_fragment(self, name, element):
  59         assert name not in self.cursors
  60         self.cursors[name] = element
  61
  62     def forget_fragment(self, name):
  63         del self.cursors[name]
  64
  65     @property
  66     def base_url(self):
  67         if self._base_url is not None:
  68             return self._base_url
  69         else:
  70             return 'https://wolnelektury.pl/media/book/pictures/{}/'.format(self.document.meta.url.slug)
  71
  72
  73     # Base URL should be on Document level, not builder.
  74     def build(self, document, **kwargs):
  75         """Should return an OutputFile with the output."""
  76         raise NotImplementedError()
  77
  78
  79 class EpubBuilder(Builder):
  80     file_extension = 'epub'
  81     isbn_field = 'isbn_epub'
  82     orphans = True
  83
  84     def __init__(self, *args, debug=False, **kwargs):
  85         self.numbering = 0
  86         self.chars = set()
  87         self.fundr = 0
  88         self.debug = debug
  89         self.splits = []
  90         super().__init__(*args, **kwargs)
  91
  92     def build(self, document, **kwargs):
  93         # replace_characters -- nie, robimy to na poziomie elementów
  94
  95         # hyphenator (\00ad w odp. miejscach) -- jeśli już, to też powinno to się dziać na poziomie elementów
  96         # spójniki (\u00a0 po)-- jeśli już, to na poziomie elementów
  97         # trick na dywizy: &#xad;&#8288;-
  98
  99         # do toc trafia:
 100         #   początek z KAŻDEGO PLIKU xml
 101
 102         # zliczamy zbiór użytych znaków
 103
 104         # flagi:
 105         # mieliśmy taką flagę less-advertising, używaną tylko dla Prestigio; już nie używamy.
 106
 107         # @editors = document.editors() (jako str)
 108         # @funders = join(meta.funders)
 109         # @thanks = meta.thanks
 110
 111
 112         self.output = output = epub.EpubBook()
 113         self.document = document
 114
 115         self.set_metadata()
 116
 117         self.add_cover()
 118
 119         self.add_title_page()
 120         self.add_toc()
 121
 122
 123
 124         self.start_chunk()
 125
 126         self.add_toc_entry(
 127             None,
 128             'Początek utworu', # i18n
 129             0
 130         )
 131         self.output.guide.append({
 132             "type": "text",
 133             "title": "Początek",
 134             "href": "part1.xhtml"
 135         })
 136
 137
 138         self.build_document(self.document)
 139
 140
 141         self.close_chunk()
 142
 143         self.add_annotations()
 144         self.add_support_page()
 145         self.add_last_page()
 146
 147         if self.fundraising:
 148             e = len(self.output.spine) - 3 - 3
 149             nfunds = len(self.fundraising)
 150             if e > 4 * nfunds:
 151                 nfunds *= 2
 152
 153             # COUNTING CHARACTERS?
 154             for f in range(nfunds):
 155                 spine_index = int(4 + (f / nfunds * e) + f)
 156
 157                 h = Xhtml()
 158                 h.body.append(
 159                     etree.XML('<div id="book-text"><div class="fundraising">' + self.fundraising[f % len(self.fundraising)] + '</div></div>')
 160                 )
 161                 self.add_html(h.element, file_name='fund%d.xhtml' % f, spine=spine_index)
 162
 163         self.add_fonts()
 164
 165         output_file = tempfile.NamedTemporaryFile(
 166             prefix='librarian', suffix='.epub',
 167             delete=False)
 168         output_file.close()
 169         epub.write_epub(output_file.name, output, {'epub3_landmark': False})
 170         return OutputFile.from_filename(output_file.name)
 171
 172     def build_document(self, document):
 173         self.toc_precedences = []
 174
 175         self.start_chunk()
 176
 177
 178         document.tree.getroot().epub_build(self)
 179         if document.meta.parts:
 180             self.start_chunk()
 181
 182             self.start_element('div', {'class': 'title-page'})
 183             self.start_element('h1', {'class': 'title'})
 184             self.push_text(document.meta.title)
 185             self.end_element()
 186             self.end_element()
 187
 188             ######
 189             # 160
 190             # translators
 191             # working copy?
 192             # ta lektura
 193             # tanks
 194             # utwor opracowany
 195             # isbn
 196             # logo
 197
 198             for child in document.children:
 199                 self.start_chunk()
 200                 self.add_toc_entry(None, child.meta.title, 0)
 201                 self.build_document(child)
 202
 203         self.shift_toc_base()
 204
 205
 206     def add_title_page(self):
 207         html = Xhtml()
 208         html.title.text = "Strona tytułowa"
 209         bt = etree.SubElement(html.body, 'div', **{'id': 'book-text'})
 210         tp = etree.SubElement(bt, 'div', **{'class': 'title-page'})
 211
 212         # Tak jak jest teraz – czy może być jednocześnie
 213         # no „autor_utworu”
 214         # i „dzieło nadrzędne”
 215         # wcześniej mogło być dzieło nadrzędne,
 216
 217         e = self.document.tree.find('//autor_utworu')
 218         if e is not None:
 219             etree.SubElement(tp, 'h2', **{'class': 'author'}).text = e.raw_printable_text(self)
 220         e = self.document.tree.find('//nazwa_utworu')
 221         if e is not None:
 222             etree.SubElement(tp, 'h1', **{'class': 'title'}).text = e.raw_printable_text(self)
 223
 224         if not len(tp):
 225             for author in self.document.meta.authors:
 226                 etree.SubElement(tp, 'h2', **{'class': 'author'}).text = author.readable()
 227             etree.SubElement(tp, 'h1', **{'class': 'title'}).text = self.document.meta.title
 228
 229 #                <xsl:apply-templates select="//nazwa_utworu | //podtytul | //dzielo_nadrzedne" mode="poczatek"/>
 230 #        else:
 231 #                            <xsl:apply-templates select="//dc:creator" mode="poczatek"/>
 232 #                <xsl:apply-templates select="//dc:title | //podtytul | //dzielo_nadrzedne" mode="poczatek"/>
 233
 234         etree.SubElement(tp, 'p', **{"class": "info"}).text = '\u00a0'
 235
 236         if self.document.meta.translators:
 237             p = etree.SubElement(tp, 'p', **{'class': 'info'})
 238             p.text = 'tłum. ' + ', '.join(t.readable() for t in self.document.meta.translators)
 239
 240         #<p class="info">[Kopia robocza]</p>
 241
 242         p = etree.XML("""<p class="info">
 243               <a>Ta lektura</a>, podobnie jak tysiące innych, jest dostępna on-line na stronie
 244               <a href="https://wolnelektury.pl/">wolnelektury.pl</a>.
 245             </p>""")
 246         p[0].attrib['href'] = str(self.document.meta.url)
 247         tp.append(p)
 248
 249         if self.document.meta.thanks:
 250             etree.SubElement(tp, 'p', **{'class': 'info'}).text = self.document.meta.thanks
 251
 252         tp.append(etree.XML("""
 253           <p class="info">
 254             Utwór opracowany został w&#160;ramach projektu<a href="https://wolnelektury.pl/"> Wolne Lektury</a> przez<a href="https://fundacja.wolnelektury.pl/"> fundację Wolne Lektury</a>.
 255           </p>
 256         """))
 257
 258         if getattr(self.document.meta, self.isbn_field):
 259             etree.SubElement(tp, 'p', **{"class": "info"}).text = getattr(self.document.meta, self.isbn_field)
 260
 261         tp.append(etree.XML("""<p class="footer info">
 262             <a href="https://wolnelektury.pl/"><img src="logo_wolnelektury.png" alt="WolneLektury.pl" /></a>
 263         </p>"""))
 264
 265         self.add_html(
 266             html.element,
 267             file_name='title.xhtml',
 268             spine=True,
 269             toc='Strona tytułowa' # TODO: i18n
 270         )
 271
 272         self.add_file(
 273             get_resource('res/wl-logo-small.png'),
 274             file_name='logo_wolnelektury.png',
 275             media_type='image/png'
 276         )
 277
 278     def set_metadata(self):
 279         self.output.set_identifier(
 280             str(self.document.meta.url))
 281         self.output.set_language(
 282             functions.lang_code_3to2(self.document.meta.language)
 283         )
 284         self.output.set_title(self.document.meta.title)
 285
 286         for i, author in enumerate(self.document.meta.authors):
 287             self.output.add_author(
 288                 author.readable(),
 289                 file_as=str(author),
 290                 uid='creator{}'.format(i)
 291             )
 292         for i, translator in enumerate(self.document.meta.translators):
 293             self.output.add_author(
 294                 translator.readable(),
 295                 file_as=str(translator),
 296                 role='trl',
 297                 uid='translator{}'.format(i)
 298             )
 299         for publisher in self.document.meta.publisher:
 300             self.output.add_metadata("DC", "publisher", publisher)
 301
 302         self.output.add_metadata("DC", "date", self.document.meta.created_at)
 303
 304
 305
 306
 307     def add_toc(self):
 308         item = epub.EpubNav()
 309         item.add_link(href='style.css', rel='stylesheet', type='text/css')
 310         self.output.add_item(item)
 311         self.output.spine.append(item)
 312         self.output.add_item(epub.EpubNcx())
 313
 314         self.output.toc.append(
 315             epub.Link(
 316                 "nav.xhtml",
 317                 "Spis treści",
 318                 "nav"
 319             )
 320         )
 321
 322
 323
 324     def add_support_page(self):
 325         self.add_file(
 326             get_resource('res/epub/support.xhtml'),
 327             spine=True,
 328             toc='Wesprzyj Wolne Lektury'
 329         )
 330
 331         self.add_file(
 332             get_resource('res/jedenprocent.png'),
 333             media_type='image/png'
 334         )
 335         self.add_file(
 336             get_resource('res/epub/style.css'),
 337             media_type='text/css'
 338         )
 339
 340
 341     def add_file(self, path=None, content=None,
 342                  media_type='application/xhtml+xml',
 343                  file_name=None, uid=None,
 344                  spine=False, toc=None):
 345
 346         # update chars?
 347         # jakieś tam ścieśnianie białych znaków?
 348
 349         if content is None:
 350             with open(path, 'rb') as f:
 351                 content = f.read()
 352             if file_name is None:
 353                 file_name = path.rsplit('/', 1)[-1]
 354
 355         if uid is None:
 356             uid = file_name.split('.', 1)[0]
 357
 358         item = epub.EpubItem(
 359             uid=uid,
 360             file_name=file_name,
 361             media_type=media_type,
 362             content=content
 363         )
 364
 365         self.output.add_item(item)
 366         if spine:
 367             if spine is True:
 368                 self.output.spine.append(item)
 369             else:
 370                 self.output.spine.insert(spine, item)
 371
 372         if toc:
 373             self.output.toc.append(
 374                 epub.Link(
 375                     file_name,
 376                     toc,
 377                     uid
 378                 )
 379             )
 380
 381     def add_html(self, html_tree, **kwargs):
 382         html = etree.tostring(
 383             html_tree, pretty_print=True, xml_declaration=True,
 384             encoding="utf-8",
 385             doctype='<!DOCTYPE html>'
 386         )
 387
 388         self.add_file(
 389             content=html,
 390             **kwargs
 391         )
 392
 393
 394     def add_fonts(self):
 395         for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
 396                       'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
 397             self.add_file(
 398                 content=strip_font(
 399                     get_resource('fonts/' + fname),
 400                     self.chars
 401                 ),
 402                 file_name=fname,
 403                 media_type='font/ttf'
 404             )
 405
 406     def start_chunk(self):
 407         if getattr(self, 'current_chunk', None) is not None:
 408             if not len(self.current_chunk):
 409                 return
 410             self.close_chunk()
 411         self.current_chunk = etree.Element(
 412             'div',
 413             id="book-text"
 414         )
 415         self.cursors[None] = self.current_chunk
 416         self.current_cursors.append(self.current_chunk)
 417
 418         self.section_number = 0
 419
 420
 421     def close_chunk(self):
 422         assert self.cursor is self.current_chunk
 423         ###### -- what if we're inside?
 424
 425         chunk_no = getattr(
 426             self,
 427             'chunk_counter',
 428             1
 429         )
 430         self.chunk_counter = chunk_no + 1
 431
 432         html = Xhtml()
 433         html.body.append(self.current_chunk)
 434
 435         self.add_html(
 436             ## html container from template.
 437             #self.current_chunk,
 438             html.element,
 439             file_name='part%d.xhtml' % chunk_no,
 440             spine=True,
 441
 442         )
 443         self.current_chunk = None
 444         self.current_cursors.pop()
 445
 446     def start_element(self, tag, attr):
 447         self.current_cursors.append(
 448             etree.SubElement(self.cursor, tag, **attr)
 449         )
 450
 451     def end_element(self):
 452         self.current_cursors.pop()
 453
 454     def push_text(self, text):
 455         self.chars.update(text)
 456         if len(self.cursor):
 457             self.cursor[-1].tail = (self.cursor[-1].tail or '') + text
 458         else:
 459             self.cursor.text = (self.cursor.text or '') + text
 460
 461
 462     def assign_image_number(self):
 463         image_number = getattr(self, 'image_number', 0)
 464         self.image_number = image_number + 1
 465         return image_number
 466
 467     def assign_footnote_number(self):
 468         number = getattr(self, 'footnote_number', 1)
 469         self.footnote_number = number + 1
 470         return number
 471
 472     def assign_section_number(self):
 473         number = getattr(self, 'section_number', 1)
 474         self.section_number = number + 1
 475         return number
 476
 477     def assign_mathml_number(self):
 478         number = getattr(self, 'mathml_number', 0)
 479         self.mathml_number = number + 1
 480         return number
 481
 482
 483     def add_toc_entry(self, fragment, name, precedence):
 484         if precedence:
 485             while self.toc_precedences and self.toc_precedences[-1] >= precedence:
 486                 self.toc_precedences.pop()
 487         else:
 488             self.toc_precedences = []
 489
 490         real_level = self.toc_base + len(self.toc_precedences)
 491         if precedence:
 492             self.toc_precedences.append(precedence)
 493         else:
 494             self.toc_base += 1
 495
 496         part_number = getattr(
 497             self,
 498             'chunk_counter',
 499             1
 500         )
 501         filename = 'part%d.xhtml' % part_number
 502         uid = filename.split('.')[0]
 503         if fragment:
 504             filename += '#' + fragment
 505             uid += '-' + fragment
 506
 507         toc = self.output.toc
 508         for l in range(1, real_level):
 509             if isinstance(toc[-1], epub.Link):
 510                 toc[-1] = [toc[-1], []]
 511             toc = toc[-1][1]
 512
 513         toc.append(
 514             epub.Link(
 515                 filename,
 516                 name,
 517                 uid
 518             )
 519         )
 520
 521     def shift_toc_base(self):
 522         self.toc_base -= 1
 523
 524
 525     def add_last_page(self):
 526         html = Xhtml()
 527         m = self.document.meta
 528
 529         html.title.text = 'Strona redakcyjna'
 530         d = etree.SubElement(html.body, 'div', id='book-text')
 531
 532         newp = lambda: etree.SubElement(d, 'p', {'class': 'info'})
 533
 534         p = newp()
 535         p.text = (
 536             "Wszystkie zasoby Wolnych Lektur możesz swobodnie wykorzystywać, "
 537             "publikować i rozpowszechniać pod warunkiem zachowania warunków "
 538             "licencji i zgodnie z "
 539         )
 540         a = etree.SubElement(p, "a", href="https://wolnelektury.pl/info/zasady-wykorzystania/")
 541         a.text = "Zasadami wykorzystania Wolnych Lektur"
 542         a.tail = "."
 543
 544         etree.SubElement(p, "br")
 545
 546
 547         if m.license:
 548             p[-1].tail = "Ten utwór jest udostępniony na licencji "
 549             etree.SubElement(p, 'a', href=m.license).text = m.license_description
 550         else:
 551             p[-1].tail = 'Ten utwór jest w domenie publicznej.'
 552
 553         etree.SubElement(p, "br")
 554
 555         p[-1].tail = (
 556             "Wszystkie materiały dodatkowe (przypisy, motywy literackie) są "
 557             "udostępnione na "
 558             )
 559         etree.SubElement(p, 'a', href='https://artlibre.org/licence/lal/pl/').text = 'Licencji Wolnej Sztuki 1.3'
 560         p[-1].tail = '.'
 561         etree.SubElement(p, "br")
 562         p[-1].tail = (
 563             "Fundacja Wolne Lektury zastrzega sobie prawa do wydania "
 564             "krytycznego zgodnie z art. Art.99(2) Ustawy o prawach autorskich "
 565             "i prawach pokrewnych. Wykorzystując zasoby z Wolnych Lektur, "
 566             "należy pamiętać o zapisach licencji oraz zasadach, które "
 567             "spisaliśmy w "
 568         )
 569
 570         etree.SubElement(p, 'a', href='https://wolnelektury.pl/info/zasady-wykorzystania/').text = 'Zasadach wykorzystania Wolnych Lektur'
 571         p[-1].tail = '. Zapoznaj się z nimi, zanim udostępnisz dalej nasze książki.'
 572
 573         p = newp()
 574         p.text = 'E-book można pobrać ze strony: '
 575         etree.SubElement(
 576             p, 'a', href=str(m.url),
 577             title=', '.join((
 578                 ', '.join(p.readable() for p in m.authors),
 579                 m.title
 580             ))
 581         ).text = str(m.url)
 582
 583         if m.source_name:
 584             newp().text = 'Tekst opracowany na podstawie: ' + m.source_name
 585
 586         newp().text = """
 587               Wydawca:
 588               """ + ", ".join(p for p in m.publisher)
 589
 590         if m.description:
 591             newp().text = m.description
 592
 593
 594         editors = self.document.editors()
 595         if editors:
 596             newp().text = 'Opracowanie redakcyjne i przypisy: %s.' % (
 597                 ', '.join(e.readable() for e in sorted(editors))
 598             )
 599
 600         if m.funders:
 601             etree.SubElement(d, 'p', {'class': 'minor-info'}).text = '''Publikację wsparli i wsparły:
 602             %s.''' % (', '.join(m.funders))
 603
 604         if m.cover_by:
 605             p = newp()
 606             p.text = 'Okładka na podstawie: '
 607             if m.cover_source:
 608                 etree.SubElement(
 609                     p,
 610                     'a',
 611                     href=m.cover_source
 612                 ).text = m.cover_by
 613             else:
 614                 p.text += m.cover_by
 615
 616         if getattr(m, self.isbn_field):
 617             newp().text = getattr(m, self.isbn_field)
 618
 619         newp().text = '\u00a0'
 620
 621         p = newp()
 622         p.attrib['class'] = 'minor-info'
 623         p.text = '''
 624               Plik wygenerowany dnia '''
 625         span = etree.SubElement(p, 'span', id='file_date')
 626         span.text = str(date.today())
 627         span.tail = '''.
 628           '''
 629
 630         self.add_html(
 631             html.element,
 632             file_name='last.xhtml',
 633             toc='Strona redakcyjna',
 634             spine=True
 635         )
 636
 637
 638     def add_annotations(self):
 639         if not len(self.footnotes):
 640             return
 641
 642         html = Xhtml()
 643         html.title.text = 'Przypisy'
 644         d = etree.SubElement(
 645             etree.SubElement(
 646                 html.body,
 647                 'div',
 648                 id='book-text'
 649             ),
 650             'div',
 651             id='footnotes'
 652         )
 653
 654         etree.SubElement(
 655             d,
 656             'h2',
 657         ).text = 'Przypisy:'
 658
 659         d.extend(self.footnotes)
 660
 661         self.add_html(
 662             html.element,
 663             file_name='annotations.xhtml',
 664             spine=True,
 665             toc='Przypisy'
 666         )
 667
 668     def add_cover(self):
 669         # TODO: allow other covers
 670
 671         cover_maker = self.make_cover
 672
 673         cover_file = io.BytesIO()
 674         cover = cover_maker(self.document.meta, width=600)
 675         cover.save(cover_file)
 676         cover_name = 'cover.%s' % cover.ext()
 677
 678         self.output.set_cover(
 679             file_name=cover_name,
 680             content=cover_file.getvalue(),
 681             create_page = False
 682         )
 683         ci = ('''<?xml version="1.0" encoding="UTF-8"?>
 684 <!DOCTYPE html>
 685 <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
 686  <head>
 687   <title>Okładka</title>
 688   <style>
 689     body { margin: 0em; padding: 0em; }
 690     img { width: 100%%; }
 691   </style>
 692  </head>
 693  <body>
 694    <img src="cover.%s" alt="Okładka" />
 695  </body>
 696 </html>''' % cover.ext()).encode('utf-8')
 697         self.add_file(file_name='cover.xhtml', content=ci)
 698
 699         self.output.spine.append(('cover', 'no'))
 700         self.output.guide.append({
 701             'type': 'cover',
 702             'href': 'cover.xhtml',
 703             'title': 'Okładka'
 704         })
 705
 706     def mathml(self, element):
 707         name = "math%d.png" % self.assign_mathml_number()
 708         self.add_file(
 709             content=MathML(element).to_latex().to_png().data,
 710             media_type='image/png',
 711             file_name=name
 712         )
 713         return name
 714
 715     def process_comment(self, comment):
 716         m = re.match(r'TRIM:(\d+)', comment.text)
 717         if m is not None:
 718             self.splits.append(comment.sourceline - int(m.group(1)))