X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/8df0bad21a18b56933084288320fa35e2a3fce6f..db91f942ce46e3af1420f3469a83257ef5aca4c2:/src/librarian/builders/epub.py diff --git a/src/librarian/builders/epub.py b/src/librarian/builders/epub.py new file mode 100644 index 0000000..91405c3 --- /dev/null +++ b/src/librarian/builders/epub.py @@ -0,0 +1,690 @@ +from datetime import date +import os +import tempfile +from ebooklib import epub +from lxml import etree +import six +from librarian import functions, OutputFile, get_resource, XHTMLNS +from librarian.cover import make_cover +from librarian.embeds.mathml import MathML +import librarian.epub +from librarian.fonts import strip_font +from librarian.fundraising import FUNDRAISING + + + + +class Xhtml: + def __init__(self): + self.element = etree.XML('''WolneLektury.pl''') + + @property + def title(self): + return self.element.find('.//' + XHTMLNS('title')) + + @property + def body(self): + return self.element.find('.//' + XHTMLNS('body')) + + +class Builder: + file_extension = None + + def __init__(self, base_url=None): + self._base_url = base_url or 'file:///home/rczajka/for/fnp/librarian/temp~/maly/img/' + self.footnotes = etree.Element('div', id='footnotes') + + self.cursors = { +# None: None, +# 'header': self.header, + 'footnotes': self.footnotes, + } + self.current_cursors = [] + + self.toc_base = 0 + + @property + def cursor(self): + return self.current_cursors[-1] + + def enter_fragment(self, fragment): + self.current_cursors.append(self.cursors[fragment]) + + def exit_fragment(self): + self.current_cursors.pop() + + def create_fragment(self, name, element): + assert name not in self.cursors + self.cursors[name] = element + + def forget_fragment(self, name): + del self.cursors[name] + + + + @property + def base_url(self): + if self._base_url is not None: + return self._base_url + else: + return 'https://wolnelektury.pl/media/book/pictures/{}/'.format(self.document.meta.url.slug) + + + # Base URL should be on Document level, not builder. + def build(self, document, **kwargs): + """Should return an OutputFile with the output.""" + raise NotImplementedError() + + +class EpubBuilder(Builder): + file_extension = 'epub' + + def __init__(self, *args, **kwargs): + self.chars = set() + self.fundr = 0 + super().__init__(*args, **kwargs) + + def build(self, document, **kwargs): + # replace_characters -- nie, robimy to na poziomie elementów + + # hyphenator (\00ad w odp. miejscach) -- jeśli już, to też powinno to się dziać na poziomie elementów + # spójniki (\u00a0 po)-- jeśli już, to na poziomie elementów + # trick na dywizy: ­⁠- + + # do toc trafia: + # początek z KAÅ»DEGO PLIKU xml + + # zliczamy zbiór użytych znaków + + # flagi: + # mieliśmy taką flagę less-advertising, używaną tylko dla Prestigio; już nie używamy. + + # @editors = document.editors() (jako str) + # @funders = join(meta.funders) + # @thanks = meta.thanks + + + self.output = output = epub.EpubBook() + self.document = document + + self.set_metadata() + + + self.add_cover() + + self.add_title_page() + self.add_toc() + + + + self.start_chunk() + + self.add_toc_entry( + None, + 'Początek utworu', # i18n + 0 + ) + self.output.guide.append({ + "type": "text", + "title": "Początek", + "href": "part1.xhtml" + }) + + + self.build_document(self.document) + + + self.close_chunk() + + self.add_annotations() + self.add_support_page() + self.add_last_page() + + + e = len(self.output.spine) - 3 - 3 + nfunds = len(FUNDRAISING) + if e > 16: + nfunds *= 2 + + # COUNTING CHARACTERS? + for f in range(nfunds): + spine_index = int(4 + (f / nfunds * e) + f) + + h = Xhtml() + h.body.append( + etree.XML('
' + FUNDRAISING[f % len(FUNDRAISING)] + '
') + ) + self.add_html(h.element, file_name='fund%d.xhtml' % f, spine=spine_index) + + self.add_fonts() + + output_file = tempfile.NamedTemporaryFile( + prefix='librarian', suffix='.epub', + delete=False) + output_file.close() + epub.write_epub(output_file.name, output, {'epub3_landmark': False}) + return OutputFile.from_filename(output_file.name) + + def build_document(self, document): + self.toc_precedences = [] + + self.start_chunk() + + + document.tree.getroot().epub_build(self) + if document.meta.parts: + self.start_chunk() + + self.start_element('div', {'class': 'title-page'}) + self.start_element('h1', {'class': 'title'}) + self.push_text(document.meta.title) + self.end_element() + self.end_element() + + ###### + # 160 + # translators + # working copy? + # ta lektura + # tanks + # utwor opracowany + # isbn + # logo + + for child in document.children: + self.start_chunk() + self.add_toc_entry(None, child.meta.title, 0) + self.build_document(child) + + self.shift_toc_base() + + + def add_title_page(self): + html = Xhtml() + html.title.text = "Strona tytułowa" + bt = etree.SubElement(html.body, 'div', **{'class': 'book-text'}) + tp = etree.SubElement(bt, 'div', **{'class': 'title-page'}) + + # Tak jak jest teraz – czy może być jednocześnie + # no „autor_utworu” + # i „dzieło nadrzędne” + # wcześniej mogło być dzieło nadrzędne, + + e = self.document.tree.find('//autor_utworu') + if e is not None: + etree.SubElement(tp, 'h2', **{'class': 'author'}).text = e.raw_printable_text() + e = self.document.tree.find('//nazwa_utworu') + if e is not None: + etree.SubElement(tp, 'h1', **{'class': 'title'}).text = e.raw_printable_text() + + if not len(tp): + for author in self.document.meta.authors: + etree.SubElement(tp, 'h2', **{'class': 'author'}).text = author.readable() + etree.SubElement(tp, 'h1', **{'class': 'title'}).text = self.document.meta.title + +# +# else: +# +# + + etree.SubElement(tp, 'p', **{"class": "info"}).text = '\u00a0' + + if self.document.meta.translators: + p = etree.SubElement(tp, 'p', **{'class': 'info'}) + p.text = 'tłum. ' + ', '.join(t.readable() for t in self.document.meta.translators) + + #

[Kopia robocza]

+ + p = etree.XML("""

+ Ta lektura, podobnie jak tysiące innych, jest dostępna on-line na stronie + wolnelektury.pl. +

""") + p[0].attrib['href'] = str(self.document.meta.url) + tp.append(p) + + if self.document.meta.thanks: + etree.SubElement(tp, 'p', **{'class': 'info'}).text = self.document.meta.thanks + + tp.append(etree.XML(""" +

+ Utwór opracowany został w ramach projektu Wolne Lektury przez fundację Nowoczesna Polska. +

+ """)) + + if self.document.meta.isbn_epub: + etree.SubElement(tp, 'p', **{"class": "info"}).text = self.document.meta.isbn_epub + + tp.append(etree.XML("""""")) + + self.add_html( + html.element, + file_name='title.xhtml', + spine=True, + toc='Strona tytułowa' # TODO: i18n + ) + + self.add_file( + get_resource('res/wl-logo-small.png'), + file_name='logo_wolnelektury.png', + media_type='image/png' + ) + + def set_metadata(self): + self.output.set_identifier( + str(self.document.meta.url)) + self.output.set_language( + functions.lang_code_3to2(self.document.meta.language) + ) + self.output.set_title(self.document.meta.title) + + for i, author in enumerate(self.document.meta.authors): + self.output.add_author( + author.readable(), + file_as=six.text_type(author), + uid='creator{}'.format(i) + ) + for translator in self.document.meta.translators: + self.output.add_author( + translator.readable(), + file_as=six.text_type(translator), + role='trl', + uid='translator{}'.format(i) + ) + for publisher in self.document.meta.publisher: + self.output.add_metadata("DC", "publisher", publisher) + + self.output.add_metadata("DC", "date", self.document.meta.created_at) + + + + + def add_toc(self): + item = epub.EpubNav() + self.output.add_item(item) + self.output.spine.append(item) + self.output.add_item(epub.EpubNcx()) + + self.output.toc.append( + epub.Link( + "nav.xhtml", + "Spis treści", + "nav" + ) + ) + + + + def add_support_page(self): + self.add_file( + get_resource('epub/support.xhtml'), + spine=True, + toc='Wesprzyj Wolne Lektury' + ) + + self.add_file( + get_resource('res/jedenprocent.png'), + media_type='image/png' + ) + self.add_file( + get_resource('epub/style.css'), + media_type='text/css' + ) + + + def add_file(self, path=None, content=None, + media_type='application/xhtml+xml', + file_name=None, uid=None, + spine=False, toc=None): + + # update chars? + # jakieś tam ścieśnianie białych znaków? + + if content is None: + with open(path, 'rb') as f: + content = f.read() + if file_name is None: + file_name = path.rsplit('/', 1)[-1] + + if uid is None: + uid = file_name.split('.', 1)[0] + + item = epub.EpubItem( + uid=uid, + file_name=file_name, + media_type=media_type, + content=content + ) + + self.output.add_item(item) + if spine: + if spine is True: + self.output.spine.append(item) + else: + self.output.spine.insert(spine, item) + + if toc: + self.output.toc.append( + epub.Link( + file_name, + toc, + uid + ) + ) + + def add_html(self, html_tree, **kwargs): + html = etree.tostring( + html_tree, pretty_print=True, xml_declaration=True, + encoding="utf-8", + doctype='' + ) + + html = librarian.epub.squeeze_whitespace(html) + + self.add_file( + content=html, + **kwargs + ) + + + def add_fonts(self): + print("".join(sorted(self.chars))) + # TODO: optimizer + for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', + 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'): + self.add_file( + content=strip_font( + get_resource('fonts/' + fname), + self.chars + ), + file_name=fname, + media_type='font/ttf' + ) + + def start_chunk(self): + if getattr(self, 'current_chunk', None) is not None: + if not len(self.current_chunk): + return + self.close_chunk() + self.current_chunk = etree.Element( + 'div', + id="book-text" + ) + self.cursors[None] = self.current_chunk + self.current_cursors.append(self.current_chunk) + + self.section_number = 0 + + + def close_chunk(self): + assert self.cursor is self.current_chunk + ###### -- what if we're inside? + + chunk_no = getattr( + self, + 'chunk_counter', + 1 + ) + self.chunk_counter = chunk_no + 1 + + html = Xhtml() + html.body.append(self.current_chunk) + + self.add_html( + ## html container from template. + #self.current_chunk, + html.element, + file_name='part%d.xhtml' % chunk_no, + spine=True, + + ) + self.current_chunk = None + self.current_cursors.pop() + + def start_element(self, tag, attr): + self.current_cursors.append( + etree.SubElement(self.cursor, tag, **attr) + ) + + def end_element(self): + self.current_cursors.pop() + + def push_text(self, text): + self.chars.update(text) + if len(self.cursor): + self.cursor[-1].tail = (self.cursor[-1].tail or '') + text + else: + self.cursor.text = (self.cursor.text or '') + text + + + def assign_image_number(self): + image_number = getattr(self, 'image_number', 0) + self.image_number = image_number + 1 + return image_number + + def assign_footnote_number(self): + number = getattr(self, 'footnote_number', 1) + self.footnote_number = number + 1 + return number + + def assign_section_number(self): + number = getattr(self, 'section_number', 1) + self.section_number = number + 1 + return number + + def assign_mathml_number(self): + number = getattr(self, 'mathml_number', 0) + self.mathml_number = number + 1 + return number + + + def add_toc_entry(self, fragment, name, precedence): + if precedence: + while self.toc_precedences and self.toc_precedences[-1] >= precedence: + self.toc_precedences.pop() + else: + self.toc_precedences = [] + + real_level = self.toc_base + len(self.toc_precedences) + if precedence: + self.toc_precedences.append(precedence) + else: + self.toc_base += 1 + + part_number = getattr( + self, + 'chunk_counter', + 1 + ) + filename = 'part%d.xhtml' % part_number + uid = filename.split('.')[0] + if fragment: + filename += '#' + fragment + uid += '-' + fragment + + toc = self.output.toc + for l in range(1, real_level): + if isinstance(toc[-1], epub.Link): + toc[-1] = [toc[-1], []] + toc = toc[-1][1] + + toc.append( + epub.Link( + filename, + name, + uid + ) + ) + + def shift_toc_base(self): + self.toc_base -= 1 + + + def add_last_page(self): + html = Xhtml() + m = self.document.meta + + html.title.text = 'Strona redakcyjna' + d = etree.SubElement(html.body, 'div', id='book-text') + + newp = lambda: etree.SubElement(d, 'p', {'class': 'info'}) + + p = newp() + if m.license: + p.text = """ + Ten utwór jest udostępniony na licencji + """ + etree.SubElement(p, 'a', href=m.license).text = m.license_description + else: + p.text = """ + Ten utwór nie jest objęty majątkowym prawem autorskim i znajduje się w domenie + publicznej, co oznacza że możesz go swobodnie wykorzystywać, publikować + i rozpowszechniać. Jeśli utwór opatrzony jest dodatkowymi materiałami + (przypisy, motywy literackie etc.), które podlegają prawu autorskiemu, to + te dodatkowe materiały udostępnione są na licencji + """ + a = etree.SubElement(p, "a", href="http://creativecommons.org/licenses/by-sa/3.0/") + a.text = """Creative Commons + Uznanie Autorstwa – Na Tych Samych Warunkach 3.0 PL""" + a.tail = "." + + + p = newp() + p.text = 'Źródło: ' + etree.SubElement( + p, 'a', href=str(m.url), + title=', '.join(( + ', '.join(p.readable() for p in m.authors), + m.title + )) + ).text = str(m.url) + + newp().text = 'Tekst opracowany na podstawie: ' + m.source_name + + newp().text = """ + Wydawca: + """ + ", ".join(p for p in m.publisher) + + if m.description: + newp().text = m.description + + + if m.editors: + newp().text = 'Opracowanie redakcyjne i przypisy: %s.' % ( + ', '.join(e.readable() for e in sorted(self.document.editors()))) + + if m.funders: + etree.SubElement(d, 'p', {'class': 'minor-info'}).text = '''Publikację wsparli i wsparły: + %s.''' % (', '.join(m.funders)) + + if m.cover_by: + p = newp() + p.text = 'Okładka na podstawie: ' + if m.cover_source: + etree.SubElement( + p, + 'a', + href=m.cover_source + ).text = m.cover_by + else: + p.text += m.cover_by + + if m.isbn_epub: + newp().text = m.isbn_epub + + newp().text = '\u00a0' + + p = newp() + p.attrib['class'] = 'minor-info' + p.text = ''' + Plik wygenerowany dnia ''' + span = etree.SubElement(p, 'span', id='file_date') + span.text = str(date.today()) + span.tail = '''. + ''' + + self.add_html( + html.element, + file_name='last.xhtml', + toc='Strona redakcyjna', + spine=True + ) + + + def add_annotations(self): + if not len(self.footnotes): + return + + html = Xhtml() + html.title.text = 'Przypisy' + d = etree.SubElement( + etree.SubElement( + html.body, + 'div', + id='book-text' + ), + 'div', + id='footnotes' + ) + + etree.SubElement( + d, + 'h2', + ).text = 'Przypisy:' + + d.extend(self.footnotes) + + self.add_html( + html.element, + file_name='annotations.xhtml', + spine=True, + toc='Przypisy' + ) + + def add_cover(self): + # TODO: allow other covers + + cover_maker = make_cover + + cover_file = six.BytesIO() + cover = cover_maker(self.document.meta) + cover.save(cover_file) + cover_name = 'cover.%s' % cover.ext() + + self.output.set_cover( + file_name=cover_name, + content=cover_file.getvalue(), + create_page = False + ) + ci = (''' + + + + Okładka + + + + Okładka + +''' % cover.ext()).encode('utf-8') + self.add_file(file_name='cover.xhtml', content=ci) + + self.output.spine.append('cover') + self.output.guide.append({ + 'type': 'cover', + 'href': 'cover.xhtml', + 'title': 'Okładka' + }) + + def mathml(self, element): + name = "math%d.png" % self.assign_mathml_number() + self.add_file( + content=MathML(element).to_latex().to_png().data, + media_type='image/png', + file_name=name + ) + return name