From 83cae63af4330912cdb2546c195af2919afd30ac Mon Sep 17 00:00:00 2001
From: Radek Czajka <rczajka@rczajka.pl>
Date: Fri, 5 Feb 2021 22:13:41 +0100
Subject: [PATCH] Images: image@src is a URL, and image sizes are limited.

---
 CHANGELOG.md                                 |  9 +++
 scripts/book2epub                            | 11 +++-
 scripts/book2html                            |  6 +-
 scripts/book2mobi                            | 14 +++--
 scripts/book2pdf                             | 16 ++++--
 setup.py                                     |  2 +-
 src/librarian/builders/daisy.py              |  2 +-
 src/librarian/builders/html.py               | 13 ++++-
 src/librarian/command_line.py                |  8 ++-
 src/librarian/document.py                    |  4 +-
 src/librarian/elements/figures/ilustr.py     |  2 +-
 src/librarian/elements/footnotes/__init__.py |  3 +
 src/librarian/epub.py                        | 52 +++++++++++------
 src/librarian/html.py                        | 60 +++++++++++++-------
 src/librarian/mobi.py                        |  4 +-
 src/librarian/pdf.py                         | 27 ++++++++-
 tests/test_html.py                           |  3 +-
 tests/test_text.py                           |  3 +-
 tox.ini                                      |  3 +-
 19 files changed, 177 insertions(+), 65 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6f17bfb..4efa62a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,15 @@
 This document records all notable changes to Librarian.
 
 
+## 1.14 (2021-02-05)
+
+### Changed
+- Image sources are now URLs. This changes the API: instead of paths
+  given as `ilustr_path`, `transform` functions now accept
+  a new `base_url` parameter.
+- Size limits introduced for images in all formats.
+
+
 ## 1.13 (2021-01-27)
 
 ### Changed
diff --git a/scripts/book2epub b/scripts/book2epub
index c1027c5..f4d5617 100755
--- a/scripts/book2epub
+++ b/scripts/book2epub
@@ -17,8 +17,15 @@ class Book2Epub(Book2Anything):
     transform_flags = [
         Option('-w', '--working-copy', dest='working-copy',
                 action='store_true', default=False,
-                help='mark the output as a working copy')
-        ]
+                help='mark the output as a working copy'
+        )
+    ]
+    transform_options = [
+        Option(
+            '-b', '--base-url', dest='base_url', metavar='URL',
+            help='specifies the base URL for relative image references'
+        ),
+    ]
 
 
 if __name__ == '__main__':
diff --git a/scripts/book2html b/scripts/book2html
index 02d542c..66b8999 100755
--- a/scripts/book2html
+++ b/scripts/book2html
@@ -20,7 +20,11 @@ class Book2Html(Book2Anything):
                 help='output raw text for use in templates')
     ]
     transform_options = [
-        Option('--css', dest='css')
+        Option('--css', dest='css'),
+        Option(
+            '-b', '--base-url', dest='base_url', metavar='URL',
+            help='specifies the base URL for relative image references'
+        ),
     ]
     parser_options = [
         Option('-i', '--ignore-dublin-core', dest='parse_dublincore', 
diff --git a/scripts/book2mobi b/scripts/book2mobi
index b0d0686..174ef57 100755
--- a/scripts/book2mobi
+++ b/scripts/book2mobi
@@ -17,10 +17,16 @@ class Book2Mobi(Book2Anything):
     uses_provider = True
 
     transform_options = [
-        Option('-k', '--use-kindlegen',
-                action='store_true', dest='use_kindlegen', default=False,
-                help='use kindlegen tool instead of Calibre')
-        ]
+        Option(
+            '-k', '--use-kindlegen',
+            action='store_true', dest='use_kindlegen', default=False,
+            help='use kindlegen tool instead of Calibre'
+        ),
+        Option(
+            '-b', '--base-url', dest='base_url', metavar='URL',
+            help='specifies the base URL for relative image references'
+        ),
+    ]
 
 
 if __name__ == '__main__':
diff --git a/scripts/book2pdf b/scripts/book2pdf
index 3c363f1..b3f422e 100755
--- a/scripts/book2pdf
+++ b/scripts/book2pdf
@@ -15,10 +15,18 @@ class Book2Pdf(Book2Anything):
     uses_cover = True
     uses_provider = True
     transform_options = [
-        Option('-t', '--save-tex', dest='save_tex', metavar='FILE',
-                help='path to save the intermediary LaTeX file to'),
-        Option('-m', '--morefloats', dest='morefloats', metavar='old/new/none',
-                help='force morefloats in old (<1.0c), new (>=1.0c) or none')
+        Option(
+            '-t', '--save-tex', dest='save_tex', metavar='FILE',
+            help='path to save the intermediary LaTeX file to'
+        ),
+        Option(
+            '-m', '--morefloats', dest='morefloats', metavar='old/new/none',
+            help='force morefloats in old (<1.0c), new (>=1.0c) or none'
+        ),
+        Option(
+            '-b', '--base-url', dest='base_url', metavar='URL',
+            help='specifies the base URL for relative image references'
+        ),
     ]
 
 
diff --git a/setup.py b/setup.py
index c1af4e3..5b6f487 100755
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@ def whole_tree(prefix, path):
 
 setup(
     name='librarian',
-    version='1.13',
+    version='1.14',
     description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
     author="Marek Stępniowski",
     author_email='marek@stepniowski.com',
diff --git a/src/librarian/builders/daisy.py b/src/librarian/builders/daisy.py
index 38e5338..a117895 100644
--- a/src/librarian/builders/daisy.py
+++ b/src/librarian/builders/daisy.py
@@ -173,7 +173,7 @@ class DaisyBuilder:
                     **{
                         "clip-begin": "npt=%.3fs" % item[0],
                         "clip-end": "npt=%.3fs" % item[1],
-                    },
+                    }
                 )
 
             zipf.writestr(
diff --git a/src/librarian/builders/html.py b/src/librarian/builders/html.py
index ee50cb8..5096e28 100644
--- a/src/librarian/builders/html.py
+++ b/src/librarian/builders/html.py
@@ -19,8 +19,8 @@ class HtmlBuilder:
     with_nota_red = True
     no_externalities = False
 
-    def __init__(self, image_location='https://wolnelektury.pl/media/book/pictures/marcos-historia-kolorow/'):
-        self.image_location = image_location
+    def __init__(self, base_url=None):
+        self._base_url = base_url
 
         self.tree = text = etree.Element('div', **{'id': 'book-text'})
         self.header = etree.SubElement(text, 'h1')
@@ -38,6 +38,13 @@ class HtmlBuilder:
         }
         self.current_cursors = [text]
 
+    @property
+    def base_url(self):
+        """Base URL for image sources; defaults to the book's pictures URL."""
+        if self._base_url is None:
+            return 'https://wolnelektury.pl/media/book/pictures/{}/'.format(self.document.meta.url.slug)
+        return self._base_url
+
     @property
     def cursor(self):
         return self.current_cursors[-1]
@@ -60,6 +67,8 @@ class HtmlBuilder:
         document._compat_assign_section_ids()
 
     def build(self, document, **kwargs):
+        self.document = document
+
         self.preprocess(document)
         document.tree.getroot().html_build(self)
         self.postprocess(document)
diff --git a/src/librarian/command_line.py b/src/librarian/command_line.py
index e7021bb..2fc7cc1 100644
--- a/src/librarian/command_line.py
+++ b/src/librarian/command_line.py
@@ -22,13 +22,19 @@ def main(*args, **kwargs):
         help='specifies the directory for output'
     )
 
+    # Builder-specific options.
+    parser.add_argument(
+        '-b', '--base-url', metavar="URL",
+        help="Base for relative URLs in documents (like image sources)"
+    )
+
     parser.add_argument(
         '--mp3',
         metavar="FILE",
         nargs="*",
         help='specifies an MP3 file, if needed'
     )
-    
+
     args = parser.parse_args()
     builder = builders[args.builder]
 
diff --git a/src/librarian/document.py b/src/librarian/document.py
index 1c8f223..6e94ff2 100644
--- a/src/librarian/document.py
+++ b/src/librarian/document.py
@@ -1,8 +1,8 @@
 import gettext
 import os
 import re
-from urllib.request import urlopen
 from lxml import etree
+import six
 from .parser import parser
 from . import dcparser, DCNS
 from .functions import lang_code_3to2
@@ -10,7 +10,7 @@ from .functions import lang_code_3to2
 
 class WLDocument:
     def __init__(self, filename=None, url=None):
-        source = filename or urlopen(url)
+        source = filename or six.moves.urllib.request.urlopen(url)
         tree = etree.parse(source, parser=parser)
         self.tree = tree
         tree.getroot().document = self
diff --git a/src/librarian/elements/figures/ilustr.py b/src/librarian/elements/figures/ilustr.py
index ee82b08..3c3026c 100644
--- a/src/librarian/elements/figures/ilustr.py
+++ b/src/librarian/elements/figures/ilustr.py
@@ -6,7 +6,7 @@ class Ilustr(WLElement):
 
     def get_html_attr(self, builder):
         return {
-            'src': builder.image_location + self.attrib['src'],
+            'src': builder.base_url + self.attrib['src'],
             'alt': self.attrib['alt'],
             'title': self.attrib['alt'],
         }
diff --git a/src/librarian/elements/footnotes/__init__.py b/src/librarian/elements/footnotes/__init__.py
index d0a0bf1..0f30747 100644
--- a/src/librarian/elements/footnotes/__init__.py
+++ b/src/librarian/elements/footnotes/__init__.py
@@ -1,3 +1,6 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
 from ..base import WLElement
 
 
diff --git a/src/librarian/epub.py b/src/librarian/epub.py
index fc5ee16..5a88d5a 100644
--- a/src/librarian/epub.py
+++ b/src/librarian/epub.py
@@ -15,6 +15,7 @@ from mimetypes import guess_type
 
 from ebooklib import epub
 from lxml import etree
+from PIL import Image
 from tempfile import mkdtemp, NamedTemporaryFile
 from shutil import rmtree
 
@@ -378,7 +379,7 @@ def remove_empty_lists_from_toc(toc):
 
 def transform(wldoc, verbose=False, style=None,
               sample=None, cover=None, flags=None, hyphenate=False,
-              ilustr_path='', output_type='epub'):
+              base_url='file://./', output_type='epub'):
     """ produces a EPUB file
 
     sample=n: generate sample e-book (with at least n paragraphs)
@@ -567,23 +568,40 @@ def transform(wldoc, verbose=False, style=None,
 
     functions.reg_mathml_epub(output)
 
-    if os.path.isdir(ilustr_path):
-        ilustr_elements = set(ilustr.get('src')
-                              for ilustr in document.edoc.findall('//ilustr'))
-        for i, filename in enumerate(os.listdir(ilustr_path)):
-            if filename not in ilustr_elements:
-                continue
-            file_path = os.path.join(ilustr_path, filename)
-            with open(file_path, 'rb') as f:
-                output.add_item(
-                    epub.EpubItem(
-                        uid='image%s' % i,
-                        file_name=filename,
-                        media_type=guess_type(file_path)[0],
-                        content=f.read()
-                    )
-                )
+    # FIXME
+    for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
+        url = six.moves.urllib.parse.urljoin(base_url, ilustr.get('src'))
+        # Buffer the whole file: PIL decodes lazily and would otherwise
+        # read from the stream after the `with` block has closed it.
+        with six.moves.urllib.request.urlopen(url) as imgfile:
+            img = Image.open(six.BytesIO(imgfile.read()))
+
+        # Keep GIF and PNG as they are; anything else is saved as JPEG.
+        th_format, ext, media_type = {
+            'GIF': ('GIF', 'gif', 'image/gif'),
+            'PNG': ('PNG', 'png', 'image/png'),
+        }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
 
+        width = 1200
+        if img.size[0] < width:
+            th = img
+        else:
+            th = img.resize((width, round(width * img.size[1] / img.size[0])))
+
+        buffer = six.BytesIO()
+        th.save(buffer, format=th_format)
+
+        file_name = 'image%d.%s' % (i, ext)
+        ilustr.set('src', file_name)
+        output.add_item(
+            epub.EpubItem(
+                uid='image%s' % i,
+                file_name=file_name,
+                media_type=media_type,
+                content=buffer.getvalue()
+            )
+        )
+            
     # write static elements
 
     with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
diff --git a/src/librarian/html.py b/src/librarian/html.py
index d262198..363286c 100644
--- a/src/librarian/html.py
+++ b/src/librarian/html.py
@@ -51,32 +51,45 @@ def transform_abstrakt(abstrakt_element):
     return re.sub('</?blockquote[^>]*>', '', html)
 
 
-def add_image_sizes(tree, gallery_path, gallery_url):
-    widths = [360, 600, 1200, 1800]
-    for ilustr in tree.findall('//ilustr'):
+def add_image_sizes(tree, gallery_path, gallery_url, base_url):
+    """Save size-limited copies of each image and point ilustr at them."""
+    widths = [360, 600, 1200, 1800, 2400]
+
+    for i, ilustr in enumerate(tree.findall('//ilustr')):
         rel_path = ilustr.attrib['src']
-        img = Image.open(gallery_path + rel_path)
+        img_url = six.moves.urllib.parse.urljoin(base_url, rel_path)
+
+        # Buffer the whole file: PIL decodes lazily and would otherwise
+        # read from the stream after the `with` block has closed it.
+        with six.moves.urllib.request.urlopen(img_url) as f:
+            img = Image.open(six.BytesIO(f.read()))
+
+        ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
+
         srcset = []
-        for w in widths:
-            if w < img.size[0]:
-                height = round(img.size[1] * w / img.size[0])
-                th = img.resize((w, height))
-
-                fname = ('.W%d.' % w).join(rel_path.rsplit('.', 1))
-                th.save(gallery_path + fname)
-                srcset.append(" ".join((
-                    gallery_url + fname,
-                    '%dw' % w
-                    )))
-        srcset.append(" ".join((
-            gallery_url + rel_path,
-            '%dw' % img.size[0]
-        )))
+        # Needed widths: predefined and original, limited by
+        # whichever is smaller.
+        img_widths = [
+            w for w in
+            sorted(
+                set(widths + [img.size[0]])
+            )
+            if w <= min(widths[-1], img.size[0])
+        ]
+        for w in img_widths:
+            height = round(img.size[1] * w / img.size[0])
+            th = img.resize((w, height))
+            fname = '%d.W%d.%s' % (i, w, ext)
+            th.save(gallery_path + fname)
+            th_url = gallery_url + fname
+            srcset.append('%s %dw' % (th_url, w))
+            # Widths ascend, so the last URL saved is the largest copy.
+            largest_url = th_url
         ilustr.attrib['srcset'] = ", ".join(srcset)
-        ilustr.attrib['src'] = gallery_url + rel_path
+        ilustr.attrib['src'] = largest_url
 
 
-def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/'):
+def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/', base_url='file://./'):
     """Transforms the WL document to XHTML.
 
     If output_filename is None, returns an XML,
@@ -102,7 +115,12 @@ def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, ga
         if not options:
             options = {}
 
-        add_image_sizes(document.edoc, gallery_path, gallery_url)
+        try:
+            os.makedirs(gallery_path)
+        except OSError:
+            pass
+
+        add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)
 
         css = (
             css
diff --git a/src/librarian/mobi.py b/src/librarian/mobi.py
index 337db75..d6c2515 100644
--- a/src/librarian/mobi.py
+++ b/src/librarian/mobi.py
@@ -14,7 +14,7 @@ from librarian import OutputFile
 
 
 def transform(wldoc, verbose=False, sample=None, cover=None,
-              use_kindlegen=False, flags=None, hyphenate=True, ilustr_path='',
+              use_kindlegen=False, flags=None, hyphenate=True, base_url='',
               converter_path=None):
     """ produces a MOBI file
 
@@ -31,7 +31,7 @@ def transform(wldoc, verbose=False, sample=None, cover=None,
 
     epub = document.as_epub(verbose=verbose, sample=sample,
                             cover=cover or True, flags=flags,
-                            hyphenate=hyphenate, ilustr_path=ilustr_path,
+                            hyphenate=hyphenate, base_url=base_url,
                             output_type='mobi')
     if verbose:
         kwargs = {}
diff --git a/src/librarian/pdf.py b/src/librarian/pdf.py
index a51dbb5..cad66a4 100644
--- a/src/librarian/pdf.py
+++ b/src/librarian/pdf.py
@@ -20,6 +20,7 @@ from copy import deepcopy
 from subprocess import call, PIPE
 from itertools import chain
 
+from PIL import Image
 from Texml.processor import process
 from lxml import etree
 from lxml.etree import XMLSyntaxError, XSLTApplyError
@@ -242,7 +243,7 @@ def package_available(package, args='', verbose=False):
 
 
 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
-              cover=None, flags=None, customizations=None, ilustr_path='',
+              cover=None, flags=None, customizations=None, base_url='file://./',
               latex_dir=False):
     """ produces a PDF file with XeLaTeX
 
@@ -314,8 +315,28 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
         # TeXML -> LaTeX
         temp = mkdtemp('-wl2pdf')
 
-        for ilustr in document.edoc.findall("//ilustr"):
-            shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
+        # Fetch every illustration into the temporary build directory,
+        # capped at 2400 px of width, and point ilustr at the local copy.
+        for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
+            url = six.moves.urllib.parse.urljoin(base_url, ilustr.get('src'))
+            # Buffer the whole file: PIL decodes lazily and would otherwise
+            # read from the stream after the `with` block has closed it.
+            with six.moves.urllib.request.urlopen(url) as imgfile:
+                img = Image.open(six.BytesIO(imgfile.read()))
+
+            # GIF and PNG keep their type; everything else becomes JPEG
+            # (the format is picked by save() from the file extension).
+            ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
+
+            width = 2400
+            if img.size[0] < width:
+                th = img
+            else:
+                th = img.resize((width, round(width * img.size[1] / img.size[0])))
+
+            file_name = 'image%d.%s' % (i, ext)
+            th.save(os.path.join(temp, file_name))
+            ilustr.set('src', file_name)
 
         for sponsor in book_info.sponsors:
             ins = etree.Element("data-sponsor", name=sponsor)
diff --git a/tests/test_html.py b/tests/test_html.py
index 36651fa..a3d042d 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -8,6 +8,7 @@ from __future__ import unicode_literals
 import io
 from unittest import TestCase
 from librarian import NoDublinCore
+from librarian.builders import builders
 from librarian.document import WLDocument
 from librarian.parser import WLDocument as LegacyWLDocument
 from nose.tools import *
@@ -30,7 +31,7 @@ class TransformTest(TestCase):
         expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.html')
         html = WLDocument(
             filename=get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
-        ).build('html').get_bytes().decode('utf-8')
+        ).build(builders['html']).get_bytes().decode('utf-8')
 
         self.assertEqual(html, io.open(expected_output_file_path).read())
 
diff --git a/tests/test_text.py b/tests/test_text.py
index bdd3ded..4cb2b7b 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -6,6 +6,7 @@
 from __future__ import unicode_literals
 
 from librarian import NoDublinCore
+from librarian.builders import builders
 from librarian.parser import WLDocument as LegacyWLDocument
 from librarian.document import WLDocument
 from nose.tools import *
@@ -27,7 +28,7 @@ def test_transform():
 
     text = WLDocument(
         filename=get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
-    ).build('txt').get_bytes()
+    ).build(builders['txt']).get_bytes()
 
     assert_equal(text, open(expected_output_file_path, 'rb').read())
 
diff --git a/tox.ini b/tox.ini
index 48c35e3..d1fb9b3 100644
--- a/tox.ini
+++ b/tox.ini
@@ -9,9 +9,10 @@ deps =
     nose
     coverage
 passenv = HOME  ; Needed to find locally installed fonts when testing PDF production.
+download = true
 commands =
     nosetests --with-coverage --cover-package=librarian -d --with-doctest --with-xunit --exe
-install_command = pip install --extra-index-url https://py.mdrn.pl/simple {packages}
+install_command = pip install numpy; pip install --extra-index-url https://py.mdrn.pl/simple {packages}
 
 [testenv:clean]
 basepython = python3
-- 
2.20.1