X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/8550d172b829f29b2fcd4723789fb2a5d9fff6eb..7bd7f505b6420e7667b29d5aa7e6a2e3a2a520bd:/src/librarian/epub.py?ds=sidebyside diff --git a/src/librarian/epub.py b/src/librarian/epub.py index 137796e..35766b8 100644 --- a/src/librarian/epub.py +++ b/src/librarian/epub.py @@ -9,16 +9,16 @@ import os import os.path import re import subprocess -from six import BytesIO +import six from copy import deepcopy from mimetypes import guess_type +from ebooklib import epub from lxml import etree -import zipfile from tempfile import mkdtemp, NamedTemporaryFile from shutil import rmtree -from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile +from librarian import RDFNS, WLNS, DCNS, OutputFile from librarian.cover import make_cover from librarian import functions, get_resource @@ -26,7 +26,6 @@ from librarian import functions, get_resource from librarian.hyphenator import Hyphenator functions.reg_person_name() -functions.reg_lang_code_3to2() def squeeze_whitespace(s): @@ -34,21 +33,9 @@ def squeeze_whitespace(s): def set_hyph_language(source_tree): - def get_short_lng_code(text): - result = '' - text = ''.join(text) - with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f: - for line in f.read().decode('latin1').split('\n'): - list = line.strip().split('|') - if list[0] == text: - result = list[2] - if result == '': - return text - else: - return result bibl_lng = etree.XPath('//dc:language//text()', namespaces={'dc': str(DCNS)})(source_tree) - short_lng = get_short_lng_code(bibl_lng[0]) + short_lng = functions.lang_code_3to2(bibl_lng[0]) try: return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' + short_lng + '.dic')) @@ -249,104 +236,6 @@ def replace_by_verse(tree): Stanza(stanza).versify() -def add_to_manifest(manifest, partno): - """ Adds a node to the manifest section in content.opf file """ - - partstr = 'part%d' % partno - e = manifest.makeelement( - OPFNS('item'), attrib={'id': partstr, 'href': partstr + '.html', - 'media-type': 'application/xhtml+xml'} - ) - manifest.append(e) - - -def add_to_spine(spine, partno): - """ Adds a node to the spine section in content.opf file """ - - e = spine.makeelement( - OPFNS('itemref'), - attrib={'idref': 'part%d' % partno} - ) - spine.append(e) - - -class TOC(object): - def __init__(self, name=None, part_href=None): - self.children = [] - self.name = name - self.part_href = part_href - self.sub_number = None - - def add(self, name, part_href, level=0, is_part=True, index=None): - assert level == 0 or index is None - if level > 0 and self.children: - return self.children[-1].add(name, part_href, level - 1, is_part) - else: - t = TOC(name) - t.part_href = part_href - if index is not None: - self.children.insert(index, t) - else: - self.children.append(t) - if not is_part: - t.sub_number = len(self.children) + 1 - return t.sub_number - - def append(self, toc): - self.children.append(toc) - - def extend(self, toc): - self.children.extend(toc.children) - - def depth(self): - if self.children: - return max((c.depth() for c in self.children)) + 1 - else: - return 0 - - def href(self): - src = self.part_href - if self.sub_number is not None: - src += '#sub%d' % self.sub_number - return src - - def write_to_xml(self, nav_map, counter=1): - for child in self.children: - nav_point = nav_map.makeelement(NCXNS('navPoint')) - nav_point.set('id', 'NavPoint-%d' % counter) - nav_point.set('playOrder', str(counter)) - - nav_label = nav_map.makeelement(NCXNS('navLabel')) - text = nav_map.makeelement(NCXNS('text')) - if child.name is not None: - text.text = re.sub(r'\n', ' ', child.name) - else: - text.text = child.name - nav_label.append(text) - nav_point.append(nav_label) - - content = nav_map.makeelement(NCXNS('content')) - content.set('src', child.href()) - nav_point.append(content) - nav_map.append(nav_point) - counter = child.write_to_xml(nav_point, counter + 1) - return counter - - def html_part(self, depth=0): - texts = [] - for child in self.children: - texts.append( - "
" % - (depth, child.href(), child.name)) - texts.append(child.html_part(depth + 1)) - return "\n".join(texts) - - def html(self): - with open(get_resource('epub/toc.html'), 'rb') as f: - t = f.read().decode('utf-8') - return t % self.html_part() - - def used_chars(element): """ Lists characters used in an ETree Element """ chars = set((element.text or '') + (element.tail or '')) @@ -413,19 +302,55 @@ def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, and a set of used characters. """ - toc = TOC() + toc = [] for element in chunk_xml[0]: if element.tag == "naglowek_czesc": - toc.add(node_name(element), "part%d.html#book-text" % chunk_no) + toc.append( + ( + epub.Link( + "part%d.xhtml#book-text" % chunk_no, + node_name(element), + "part%d-text" % chunk_no + ), + [] + ) + ) elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"): - toc.add(node_name(element), "part%d.html" % chunk_no) + toc.append( + ( + epub.Link( + "part%d.xhtml" % chunk_no, + node_name(element), + "part%d" % chunk_no + ), + [] + ) + ) elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'): - subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, - level=1, is_part=False) - element.set('sub', str(subnumber)) + if not toc: + toc.append( + ( + epub.Link( + "part%d.xhtml" % chunk_no, + " ", + "part%d" % chunk_no + ), + [] + ) + ) + + subnumber = len(toc[-1][1]) + toc[-1][1].append( + epub.Link( + "part%d.xhtml#sub%d" % (chunk_no, subnumber), + node_name(element), + "part%d-sub%d" % (chunk_no, subnumber) + ) + ) + element.set('sub', six.text_type(subnumber)) if empty: if not _empty_html_static: - with open(get_resource('epub/emptyChunk.html')) as f: + with open(get_resource('epub/emptyChunk.xhtml')) as f: _empty_html_static.append(f.read()) chars = set() output_html = _empty_html_static[0] @@ -437,13 +362,21 @@ def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, output_html = etree.tostring( html_tree, pretty_print=True, xml_declaration=True, encoding="utf-8", - doctype='' + doctype='' ) return output_html, toc, chars -def transform(wldoc, verbose=False, style=None, html_toc=False, +def remove_empty_lists_from_toc(toc): + for i, e in enumerate(toc): + if isinstance(e, tuple): + if e[1]: + remove_empty_lists_from_toc(e[1]) + else: + toc[i] = e[0] + + +def transform(wldoc, verbose=False, style=None, sample=None, cover=None, flags=None, hyphenate=False, ilustr_path='', output_type='epub'): """ produces a EPUB file @@ -465,7 +398,16 @@ def transform(wldoc, verbose=False, style=None, html_toc=False, # every input file will have a TOC entry, # pointing to starting chunk - toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter) + toc = [ + ( + epub.Link( + "part%d.xhtml" % chunk_counter, + wldoc.book_info.title, + "path%d-start" % chunk_counter + ), + [] + ) + ] chars = set() if first: # write book title page @@ -475,17 +417,42 @@ def transform(wldoc, verbose=False, style=None, html_toc=False, html_string = etree.tostring( html_tree, pretty_print=True, xml_declaration=True, encoding="utf-8", - doctype='' + doctype='' + ) + item = epub.EpubItem( + uid="titlePage", + file_name="title.xhtml", + media_type="application/xhtml+xml", + content=squeeze_whitespace(html_string) ) - zip.writestr('OPS/title.html', squeeze_whitespace(html_string)) + spine.append(item) + output.add_item(item) # add a title page TOC entry - toc.add(u"Strona tytuÅowa", "title.html") + toc[-1][1].append( + epub.Link( + "title.xhtml", + "Strona tytuÅowa", + "title", + ) + ) + + item = epub.EpubNav() + toc[-1][1].append( + epub.Link( + "nav.xhtml", + "Spis treÅci", + "nav" + ) + ) + output.add_item(item) + spine.append(item) + elif wldoc.book_info.parts: # write title page for every parent if sample is not None and sample <= 0: chars = set() - html_string = open(get_resource('epub/emptyChunk.html')).read() + html_string = open( + get_resource('epub/emptyChunk.xhtml')).read() else: html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl')) @@ -493,13 +460,17 @@ def transform(wldoc, verbose=False, style=None, html_toc=False, html_string = etree.tostring( html_tree, pretty_print=True, xml_declaration=True, encoding="utf-8", - doctype='' + doctype='' ) - zip.writestr('OPS/part%d.html' % chunk_counter, - squeeze_whitespace(html_string)) - add_to_manifest(manifest, chunk_counter) - add_to_spine(spine, chunk_counter) + item = epub.EpubItem( + uid="part%d" % chunk_counter, + file_name="part%d.xhtml" % chunk_counter, + media_type="application/xhtml+xml", + content=squeeze_whitespace(html_string) + ) + output.add_item(item) + spine.append(item) + chunk_counter += 1 if len(wldoc.edoc.getroot()) > 1: @@ -524,18 +495,22 @@ def transform(wldoc, verbose=False, style=None, html_toc=False, chunk_html, chunk_toc, chunk_chars = transform_chunk( chunk_xml, chunk_counter, annotations, empty) - toc.extend(chunk_toc) + toc[-1][1].extend(chunk_toc) chars = chars.union(chunk_chars) - zip.writestr('OPS/part%d.html' % chunk_counter, - squeeze_whitespace(chunk_html)) - add_to_manifest(manifest, chunk_counter) - add_to_spine(spine, chunk_counter) + item = epub.EpubItem( + uid="part%d" % chunk_counter, + file_name="part%d.xhtml" % chunk_counter, + media_type="application/xhtml+xml", + content=squeeze_whitespace(chunk_html) + ) + output.add_item(item) + spine.append(item) chunk_counter += 1 for child in wldoc.parts(): child_toc, chunk_counter, chunk_chars, sample = transform_file( child, chunk_counter, first=False, sample=sample) - toc.append(child_toc) + toc[-1][1].extend(child_toc) chars = chars.union(chunk_chars) return toc, chunk_counter, chars, sample @@ -561,17 +536,36 @@ def transform(wldoc, verbose=False, style=None, html_toc=False, if document.book_info.thanks: document.edoc.getroot().set('thanks', document.book_info.thanks) - opf = xslt(document.book_info.to_etree(), - get_resource('epub/xsltContent.xsl')) - manifest = opf.find('.//' + OPFNS('manifest')) - guide = opf.find('.//' + OPFNS('guide')) - spine = opf.find('.//' + OPFNS('spine')) + output = epub.EpubBook() + output.set_identifier(six.text_type(document.book_info.url)) + output.set_language(functions.lang_code_3to2(document.book_info.language)) + output.set_title(document.book_info.title) + for author in document.book_info.authors: + output.add_author( + author.readable(), + file_as=six.text_type(author) + ) + for translator in document.book_info.translators: + output.add_author( + translator.readable(), + file_as=six.text_type(translator), + role='translator' + ) + for publisher in document.book_info.publisher: + output.add_metadata("DC", "publisher", publisher) + output.add_metadata("DC", "date", document.book_info.created_at) - output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', - delete=False) - zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED) + output.guide.append({ + "type": "text", + "title": "PoczÄ tek", + "href": "part1.xhtml" + }) + + output.add_item(epub.EpubNcx()) + + spine = output.spine - functions.reg_mathml_epub(zip) + functions.reg_mathml_epub(output) if os.path.isdir(ilustr_path): ilustr_elements = set(ilustr.get('src') @@ -580,55 +574,70 @@ def transform(wldoc, verbose=False, style=None, html_toc=False, if filename not in ilustr_elements: continue file_path = os.path.join(ilustr_path, filename) - zip.write(file_path, os.path.join('OPS', filename)) - image_id = 'image%s' % i - manifest.append(etree.fromstring( - '