X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/ca8319931f449468918067367133ff25f9b19f30..refs/heads/secondary:/src/librarian/epub.py?ds=sidebyside diff --git a/src/librarian/epub.py b/src/librarian/epub.py index a8c6680..a3931b5 100644 --- a/src/librarian/epub.py +++ b/src/librarian/epub.py @@ -15,6 +15,7 @@ from mimetypes import guess_type from ebooklib import epub from lxml import etree +from PIL import Image from tempfile import mkdtemp, NamedTemporaryFile from shutil import rmtree @@ -29,6 +30,7 @@ functions.reg_person_name() def squeeze_whitespace(s): + return s return re.sub(b'\\s+', b' ', s) @@ -61,33 +63,6 @@ def hyphenate_and_fix_conjunctions(source_tree, hyph): parent.tail = newt -def inner_xml(node): - """ returns node's text and children as a string - - >>> print(inner_xml(etree.fromstring('xyz'))) - xyz - """ - - nt = node.text if node.text is not None else '' - return ''.join( - [nt] + [etree.tostring(child, encoding='unicode') for child in node] - ) - - -def set_inner_xml(node, text): - """ sets node's text and children from a string - - >>> e = etree.fromstring('bxx') - >>> set_inner_xml(e, 'xyz') - >>> print(etree.tostring(e, encoding='unicode')) - xyz - """ - - p = etree.fromstring('%s' % text) - node.text = p.text - node[:] = p[:] - - def node_name(node): """ Find out a node's name @@ -327,6 +302,18 @@ def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, ) ) elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'): + if not toc: + toc.append( + ( + epub.Link( + "part%d.xhtml" % chunk_no, + " ", + "part%d" % chunk_no + ), + [] + ) + ) + subnumber = len(toc[-1][1]) toc[-1][1].append( epub.Link( @@ -364,17 +351,8 @@ def remove_empty_lists_from_toc(toc): toc[i] = e[0] -def transform(wldoc, verbose=False, style=None, - sample=None, cover=None, flags=None, hyphenate=False, - ilustr_path='', output_type='epub'): - """ produces a EPUB file - - sample=n: generate sample e-book (with at least n paragraphs) - cover: a cover.Cover factory or True for default - flags: less-advertising, without-fonts, working-copy - """ - def transform_file(wldoc, chunk_counter=1, first=True, sample=None): +def transform_file(wldoc, chunk_counter=1, first=True, sample=None, hyphenate=False, output_type='epub', spine=None, output=None, annotations=None): """ processes one input file and proceeds to its children """ replace_characters(wldoc.edoc.getroot()) @@ -435,6 +413,14 @@ def transform(wldoc, verbose=False, style=None, output.add_item(item) spine.append(item) + toc[-1][1].append( + epub.Link( + "part1.xhtml", + "Początek utworu", + "part1" + ) + ) + elif wldoc.book_info.parts: # write title page for every parent if sample is not None and sample <= 0: @@ -497,12 +483,27 @@ def transform(wldoc, verbose=False, style=None, for child in wldoc.parts(): child_toc, chunk_counter, chunk_chars, sample = transform_file( - child, chunk_counter, first=False, sample=sample) + child, chunk_counter, first=False, sample=sample, + hyphenate=hyphenate, output_type=output_type, + spine=spine, output=output, annotations=annotations, + ) toc[-1][1].extend(child_toc) chars = chars.union(chunk_chars) return toc, chunk_counter, chars, sample + +def transform(wldoc, verbose=False, style=None, + sample=None, cover=None, flags=None, hyphenate=False, + base_url='file://./', output_type='epub'): + """ produces a EPUB file + + sample=n: generate sample e-book (with at least n paragraphs) + cover: a cover.Cover factory or True for default + flags: less-advertising, without-fonts, working-copy + """ + + document = deepcopy(wldoc) del wldoc @@ -528,16 +529,18 @@ def transform(wldoc, verbose=False, style=None, output.set_identifier(six.text_type(document.book_info.url)) output.set_language(functions.lang_code_3to2(document.book_info.language)) output.set_title(document.book_info.title) - for author in document.book_info.authors: + for i, author in enumerate(document.book_info.authors): output.add_author( author.readable(), - file_as=six.text_type(author) + file_as=six.text_type(author), + uid='creator{}'.format(i) ) for translator in document.book_info.translators: output.add_author( translator.readable(), file_as=six.text_type(translator), - role='translator' + role='trl', + uid='translator{}'.format(i) ) for publisher in document.book_info.publisher: output.add_metadata("DC", "publisher", publisher) @@ -553,25 +556,44 @@ def transform(wldoc, verbose=False, style=None, spine = output.spine - functions.reg_mathml_epub(zip) + functions.reg_mathml_epub(output) - if os.path.isdir(ilustr_path): - ilustr_elements = set(ilustr.get('src') - for ilustr in document.edoc.findall('//ilustr')) - for i, filename in enumerate(os.listdir(ilustr_path)): - if filename not in ilustr_elements: - continue - file_path = os.path.join(ilustr_path, filename) - with open(file_path, 'rb') as f: - output.add_item( - epub.EpubItem( - uid='image%s' % i, - file_name=filename, - media_type=guess_type(file_path)[0], - content=f.read() - ) - ) + # FIXME + for i, ilustr in enumerate(document.edoc.findall('//ilustr')): + url = six.moves.urllib.parse.urljoin( + base_url, + ilustr.get('src') + ) + imgfile = six.moves.urllib.request.urlopen(url) + img = Image.open(imgfile) + + th_format, ext, media_type = { + 'GIF': ('GIF', 'gif', 'image/gif'), + 'PNG': ('PNG', 'png', 'image/png'), + }.get(img.format, ('JPEG', 'jpg', 'image/jpeg')) + + width = 1200 + if img.size[0] < width: + th = img + else: + th = img.resize((width, round(width * img.size[1] / img.size[0]))) + imgfile.close() + + buffer = six.BytesIO() + th.save(buffer, format=th_format) + + file_name = 'image%d.%s' % (i, ext) + ilustr.set('src', file_name) + output.add_item( + epub.EpubItem( + uid='image%s' % i, + file_name=file_name, + media_type=media_type, + content=buffer.getvalue() + ) + ) + # write static elements with open(get_resource('res/wl-logo-small.png'), 'rb') as f: @@ -637,21 +659,16 @@ def transform(wldoc, verbose=False, style=None, annotations = etree.Element('annotations') - toc, chunk_counter, chars, sample = transform_file(document, sample=sample) + toc, chunk_counter, chars, sample = transform_file( + document, sample=sample, + hyphenate=hyphenate, output_type=output_type, + spine=spine, output=output, annotations=annotations + ) output.toc = toc[0][1] - if len(toc) < 2: - toc.append( - epub.Link( - "part1.xhtml", - "Początek utworu", - "part1" - ) - ) - # Last modifications in container files and EPUB creation if len(annotations) > 0: - toc.append( + output.toc.append( epub.Link( "annotations.xhtml", "Przypisy", @@ -675,7 +692,7 @@ def transform(wldoc, verbose=False, style=None, output.add_item(item) spine.append(item) - toc.append( + output.toc.append( epub.Link( "support.xhtml", "Wesprzyj Wolne Lektury", @@ -694,7 +711,7 @@ def transform(wldoc, verbose=False, style=None, output.add_item(item) spine.append(item) - toc.append( + output.toc.append( epub.Link( "last.xhtml", "Strona redakcyjna",