From 83cae63af4330912cdb2546c195af2919afd30ac Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Fri, 5 Feb 2021 22:13:41 +0100 Subject: [PATCH] Images: image@src is a URL, and image sizes are limited. --- CHANGELOG.md | 9 +++ scripts/book2epub | 11 +++- scripts/book2html | 6 +- scripts/book2mobi | 14 +++-- scripts/book2pdf | 16 ++++-- setup.py | 2 +- src/librarian/builders/daisy.py | 2 +- src/librarian/builders/html.py | 13 ++++- src/librarian/command_line.py | 8 ++- src/librarian/document.py | 4 +- src/librarian/elements/figures/ilustr.py | 2 +- src/librarian/elements/footnotes/__init__.py | 3 + src/librarian/epub.py | 52 +++++++++++------ src/librarian/html.py | 60 +++++++++++++------- src/librarian/mobi.py | 4 +- src/librarian/pdf.py | 27 ++++++++- tests/test_html.py | 3 +- tests/test_text.py | 3 +- tox.ini | 3 +- 19 files changed, 177 insertions(+), 65 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f17bfb..4efa62a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,15 @@ This document records all notable changes to Librarian. +## 1.14 (2021-02-05) + +### Changed +- Image sources are now URLs. This changes the API: instead of paths + given as `ilustr_path`, `transform` functions now accept + a new `base_url` parameter. +- Size limits introduced for images in all formats. + + ## 1.13 (2021-01-27) ### Changed diff --git a/scripts/book2epub b/scripts/book2epub index c1027c5..f4d5617 100755 --- a/scripts/book2epub +++ b/scripts/book2epub @@ -17,8 +17,15 @@ class Book2Epub(Book2Anything): transform_flags = [ Option('-w', '--working-copy', dest='working-copy', action='store_true', default=False, - help='mark the output as a working copy') - ] + help='mark the output as a working copy' + ) + ] + transform_options = [ + Option( + '-b', '--base-url', dest='base_url', metavar='URL', + help='specifies the base URL for relative image references' + ), + ] if __name__ == '__main__': diff --git a/scripts/book2html b/scripts/book2html index 02d542c..66b8999 100755 --- a/scripts/book2html +++ b/scripts/book2html @@ -20,7 +20,11 @@ class Book2Html(Book2Anything): help='output raw text for use in templates') ] transform_options = [ - Option('--css', dest='css') + Option('--css', dest='css'), + Option( + '-b', '--base-url', dest='base_url', metavar='URL', + help='specifies the base URL for relative image references' + ), ] parser_options = [ Option('-i', '--ignore-dublin-core', dest='parse_dublincore', diff --git a/scripts/book2mobi b/scripts/book2mobi index b0d0686..174ef57 100755 --- a/scripts/book2mobi +++ b/scripts/book2mobi @@ -17,10 +17,16 @@ class Book2Mobi(Book2Anything): uses_provider = True transform_options = [ - Option('-k', '--use-kindlegen', - action='store_true', dest='use_kindlegen', default=False, - help='use kindlegen tool instead of Calibre') - ] + Option( + '-k', '--use-kindlegen', + action='store_true', dest='use_kindlegen', default=False, + help='use kindlegen tool instead of Calibre' + ), + Option( + '-b', '--base-url', dest='base_url', metavar='URL', + help='specifies the base URL for relative image references' + ), + ] if __name__ == '__main__': diff --git a/scripts/book2pdf b/scripts/book2pdf index 3c363f1..b3f422e 100755 --- a/scripts/book2pdf +++ b/scripts/book2pdf @@ -15,10 +15,18 @@ class Book2Pdf(Book2Anything): uses_cover = True uses_provider = True transform_options = [ - Option('-t', '--save-tex', dest='save_tex', metavar='FILE', - help='path to save the intermediary LaTeX file to'), - Option('-m', '--morefloats', dest='morefloats', metavar='old/new/none', - help='force morefloats in old (<1.0c), new (>=1.0c) or none') + Option( + '-t', '--save-tex', dest='save_tex', metavar='FILE', + help='path to save the intermediary LaTeX file to' + ), + Option( + '-m', '--morefloats', dest='morefloats', metavar='old/new/none', + help='force morefloats in old (<1.0c), new (>=1.0c) or none' + ), + Option( + '-b', '--base-url', dest='base_url', metavar='URL', + help='specifies the base URL for relative image references' + ), ] diff --git a/setup.py b/setup.py index c1af4e3..5b6f487 100755 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ def whole_tree(prefix, path): setup( name='librarian', - version='1.13', + version='1.14', description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats', author="Marek Stępniowski", author_email='marek@stepniowski.com', diff --git a/src/librarian/builders/daisy.py b/src/librarian/builders/daisy.py index 38e5338..a117895 100644 --- a/src/librarian/builders/daisy.py +++ b/src/librarian/builders/daisy.py @@ -173,7 +173,7 @@ class DaisyBuilder: **{ "clip-begin": "npt=%.3fs" % item[0], "clip-end": "npt=%.3fs" % item[1], - }, + } ) zipf.writestr( diff --git a/src/librarian/builders/html.py b/src/librarian/builders/html.py index ee50cb8..5096e28 100644 --- a/src/librarian/builders/html.py +++ b/src/librarian/builders/html.py @@ -19,8 +19,8 @@ class HtmlBuilder: with_nota_red = True no_externalities = False - def __init__(self, image_location='https://wolnelektury.pl/media/book/pictures/marcos-historia-kolorow/'): - self.image_location = image_location + def __init__(self, base_url=None): + self._base_url = base_url self.tree = text = etree.Element('div', **{'id': 'book-text'}) self.header = etree.SubElement(text, 'h1') @@ -38,6 +38,13 @@ class HtmlBuilder: } self.current_cursors = [text] + @property + def base_url(self): + if self._base_url is not None: + return self._base_url + else: + return 'https://wolnelektury.pl/media/book/pictures/{}/'.format(self.document.meta.url.slug) + @property def cursor(self): return self.current_cursors[-1] @@ -60,6 +67,8 @@ class HtmlBuilder: document._compat_assign_section_ids() def build(self, document, **kwargs): + self.document = document + self.preprocess(document) document.tree.getroot().html_build(self) self.postprocess(document) diff --git a/src/librarian/command_line.py b/src/librarian/command_line.py index e7021bb..2fc7cc1 100644 --- a/src/librarian/command_line.py +++ b/src/librarian/command_line.py @@ -22,13 +22,19 @@ def main(*args, **kwargs): help='specifies the directory for output' ) + # Specific + parser.add_argument( + '-b', '--base-url', metavar="URL", + help="Base for relative URLs in documents (like image sources)" + ) + parser.add_argument( '--mp3', metavar="FILE", nargs="*", help='specifies an MP3 file, if needed' ) - + args = parser.parse_args() builder = builders[args.builder] diff --git a/src/librarian/document.py b/src/librarian/document.py index 1c8f223..6e94ff2 100644 --- a/src/librarian/document.py +++ b/src/librarian/document.py @@ -1,8 +1,8 @@ import gettext import os import re -from urllib.request import urlopen from lxml import etree +import six from .parser import parser from . import dcparser, DCNS from .functions import lang_code_3to2 @@ -10,7 +10,7 @@ from .functions import lang_code_3to2 class WLDocument: def __init__(self, filename=None, url=None): - source = filename or urlopen(url) + source = filename or six.moves.urllib.request.urlopen(url) tree = etree.parse(source, parser=parser) self.tree = tree tree.getroot().document = self diff --git a/src/librarian/elements/figures/ilustr.py b/src/librarian/elements/figures/ilustr.py index ee82b08..3c3026c 100644 --- a/src/librarian/elements/figures/ilustr.py +++ b/src/librarian/elements/figures/ilustr.py @@ -6,7 +6,7 @@ class Ilustr(WLElement): def get_html_attr(self, builder): return { - 'src': builder.image_location + self.attrib['src'], + 'src': builder.base_url + self.attrib['src'], 'alt': self.attrib['alt'], 'title': self.attrib['alt'], } diff --git a/src/librarian/elements/footnotes/__init__.py b/src/librarian/elements/footnotes/__init__.py index d0a0bf1..0f30747 100644 --- a/src/librarian/elements/footnotes/__init__.py +++ b/src/librarian/elements/footnotes/__init__.py @@ -1,3 +1,6 @@ +# -*- coding: utf-8 +from __future__ import unicode_literals + from ..base import WLElement diff --git a/src/librarian/epub.py b/src/librarian/epub.py index fc5ee16..5a88d5a 100644 --- a/src/librarian/epub.py +++ b/src/librarian/epub.py @@ -15,6 +15,7 @@ from mimetypes import guess_type from ebooklib import epub from lxml import etree +from PIL import Image from tempfile import mkdtemp, NamedTemporaryFile from shutil import rmtree @@ -378,7 +379,7 @@ def remove_empty_lists_from_toc(toc): def transform(wldoc, verbose=False, style=None, sample=None, cover=None, flags=None, hyphenate=False, - ilustr_path='', output_type='epub'): + base_url='file://./', output_type='epub'): """ produces a EPUB file sample=n: generate sample e-book (with at least n paragraphs) @@ -567,23 +568,40 @@ def transform(wldoc, verbose=False, style=None, functions.reg_mathml_epub(output) - if os.path.isdir(ilustr_path): - ilustr_elements = set(ilustr.get('src') - for ilustr in document.edoc.findall('//ilustr')) - for i, filename in enumerate(os.listdir(ilustr_path)): - if filename not in ilustr_elements: - continue - file_path = os.path.join(ilustr_path, filename) - with open(file_path, 'rb') as f: - output.add_item( - epub.EpubItem( - uid='image%s' % i, - file_name=filename, - media_type=guess_type(file_path)[0], - content=f.read() - ) - ) + # FIXME + for i, ilustr in enumerate(document.edoc.findall('//ilustr')): + url = six.moves.urllib.parse.urljoin( + base_url, + ilustr.get('src') + ) + with six.moves.urllib.request.urlopen(url) as imgfile: + img = Image.open(imgfile) + + th_format, ext, media_type = { + 'GIF': ('GIF', 'gif', 'image/gif'), + 'PNG': ('PNG', 'png', 'image/png'), + }.get(img.format, ('JPEG', 'jpg', 'image/jpeg')) + width = 1200 + if img.size[0] < width: + th = img + else: + th = img.resize((width, round(width * img.size[1] / img.size[0]))) + + buffer = six.BytesIO() + th.save(buffer, format=th_format) + + file_name = 'image%d.%s' % (i, ext) + ilustr.set('src', file_name) + output.add_item( + epub.EpubItem( + uid='image%s' % i, + file_name=file_name, + media_type=media_type, + content=buffer.getvalue() + ) + ) + # write static elements with open(get_resource('res/wl-logo-small.png'), 'rb') as f: diff --git a/src/librarian/html.py b/src/librarian/html.py index d262198..363286c 100644 --- a/src/librarian/html.py +++ b/src/librarian/html.py @@ -51,32 +51,45 @@ def transform_abstrakt(abstrakt_element): return re.sub(']*>', '', html) -def add_image_sizes(tree, gallery_path, gallery_url): - widths = [360, 600, 1200, 1800] - for ilustr in tree.findall('//ilustr'): +def add_image_sizes(tree, gallery_path, gallery_url, base_url): + widths = [360, 600, 1200, 1800, 2400] + + for i, ilustr in enumerate(tree.findall('//ilustr')): rel_path = ilustr.attrib['src'] - img = Image.open(gallery_path + rel_path) + img_url = six.moves.urllib.parse.urljoin(base_url, rel_path) + + with six.moves.urllib.request.urlopen(img_url) as f: + img = Image.open(f) + + ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg') + srcset = [] + # Needed widths: predefined and original, limited by + # whichever is smaller. + img_widths = [ + w for w in + sorted( + set(widths + [img.size[0]]) + ) + if w <= min(widths[-1], img.size[0]) + ] + largest = None for w in widths: - if w < img.size[0]: - height = round(img.size[1] * w / img.size[0]) - th = img.resize((w, height)) - - fname = ('.W%d.' % w).join(rel_path.rsplit('.', 1)) - th.save(gallery_path + fname) - srcset.append(" ".join(( - gallery_url + fname, - '%dw' % w - ))) - srcset.append(" ".join(( - gallery_url + rel_path, - '%dw' % img.size[0] - ))) + height = round(img.size[1] * w / img.size[0]) + th = img.resize((w, height)) + fname = '%d.W%d.%s' % (i, w, ext) + th.save(gallery_path + fname) + th_url = gallery_url + fname + srcset.append(" ".join(( + th_url, + '%dw' % w + ))) + largest_url = th_url ilustr.attrib['srcset'] = ", ".join(srcset) - ilustr.attrib['src'] = gallery_url + rel_path + ilustr.attrib['src'] = largest_url -def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/'): +def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/', base_url='file://./'): """Transforms the WL document to XHTML. If output_filename is None, returns an XML, @@ -102,7 +115,12 @@ def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, ga if not options: options = {} - add_image_sizes(document.edoc, gallery_path, gallery_url) + try: + os.makedirs(gallery_path) + except OSError: + pass + + add_image_sizes(document.edoc, gallery_path, gallery_url, base_url) css = ( css diff --git a/src/librarian/mobi.py b/src/librarian/mobi.py index 337db75..d6c2515 100644 --- a/src/librarian/mobi.py +++ b/src/librarian/mobi.py @@ -14,7 +14,7 @@ from librarian import OutputFile def transform(wldoc, verbose=False, sample=None, cover=None, - use_kindlegen=False, flags=None, hyphenate=True, ilustr_path='', + use_kindlegen=False, flags=None, hyphenate=True, base_url='', converter_path=None): """ produces a MOBI file @@ -31,7 +31,7 @@ def transform(wldoc, verbose=False, sample=None, cover=None, epub = document.as_epub(verbose=verbose, sample=sample, cover=cover or True, flags=flags, - hyphenate=hyphenate, ilustr_path=ilustr_path, + hyphenate=hyphenate, base_url=base_url, output_type='mobi') if verbose: kwargs = {} diff --git a/src/librarian/pdf.py b/src/librarian/pdf.py index a51dbb5..cad66a4 100644 --- a/src/librarian/pdf.py +++ b/src/librarian/pdf.py @@ -20,6 +20,7 @@ from copy import deepcopy from subprocess import call, PIPE from itertools import chain +from PIL import Image from Texml.processor import process from lxml import etree from lxml.etree import XMLSyntaxError, XSLTApplyError @@ -242,7 +243,7 @@ def package_available(package, args='', verbose=False): def transform(wldoc, verbose=False, save_tex=None, morefloats=None, - cover=None, flags=None, customizations=None, ilustr_path='', + cover=None, flags=None, customizations=None, base_url='file://./', latex_dir=False): """ produces a PDF file with XeLaTeX @@ -314,8 +315,28 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, # TeXML -> LaTeX temp = mkdtemp('-wl2pdf') - for ilustr in document.edoc.findall("//ilustr"): - shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp) + for i, ilustr in enumerate(document.edoc.findall('//ilustr')): + url = six.moves.urllib.parse.urljoin( + base_url, + ilustr.get('src') + ) + with six.moves.urllib.request.urlopen(url) as imgfile: + img = Image.open(imgfile) + + th_format, ext, media_type = { + 'GIF': ('GIF', 'gif', 'image/gif'), + 'PNG': ('PNG', 'png', 'image/png'), + }.get(img.format, ('JPEG', 'jpg', 'image/jpeg')) + + width = 2400 + if img.size[0] < width: + th = img + else: + th = img.resize((width, round(width * img.size[1] / img.size[0]))) + + file_name = 'image%d.%s' % (i, ext) + th.save(os.path.join(temp, file_name)) + ilustr.set('src', file_name) for sponsor in book_info.sponsors: ins = etree.Element("data-sponsor", name=sponsor) diff --git a/tests/test_html.py b/tests/test_html.py index 36651fa..a3d042d 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -8,6 +8,7 @@ from __future__ import unicode_literals import io from unittest import TestCase from librarian import NoDublinCore +from librarian.builders import builders from librarian.document import WLDocument from librarian.parser import WLDocument as LegacyWLDocument from nose.tools import * @@ -30,7 +31,7 @@ class TransformTest(TestCase): expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.html') html = WLDocument( filename=get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml') - ).build('html').get_bytes().decode('utf-8') + ).build(builders['html']).get_bytes().decode('utf-8') self.assertEqual(html, io.open(expected_output_file_path).read()) diff --git a/tests/test_text.py b/tests/test_text.py index bdd3ded..4cb2b7b 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -6,6 +6,7 @@ from __future__ import unicode_literals from librarian import NoDublinCore +from librarian.builders import builders from librarian.parser import WLDocument as LegacyWLDocument from librarian.document import WLDocument from nose.tools import * @@ -27,7 +28,7 @@ def test_transform(): text = WLDocument( filename=get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml') - ).build('txt').get_bytes() + ).build(builders['txt']).get_bytes() assert_equal(text, open(expected_output_file_path, 'rb').read()) diff --git a/tox.ini b/tox.ini index 48c35e3..d1fb9b3 100644 --- a/tox.ini +++ b/tox.ini @@ -9,9 +9,10 @@ deps = nose coverage passenv = HOME ; Needed to find locally installed fonts when testing PDF production. +download = true commands = nosetests --with-coverage --cover-package=librarian -d --with-doctest --with-xunit --exe -install_command = pip install --extra-index-url https://py.mdrn.pl/simple {packages} +install_command = pip install numpy; pip install --extra-index-url https://py.mdrn.pl/simple {packages} [testenv:clean] basepython = python3 -- 2.20.1