This document records all notable changes to Librarian.
+## 1.14 (2021-02-05)
+
+### Changed
+- Image sources are now URLs. This changes the API: instead of paths
+ given as `ilustr_path`, `transform` functions now accept
+ a new `base_url` parameter.
+- Size limits introduced for images in all formats.
+
+
## 1.13 (2021-01-27)
### Changed
transform_flags = [
Option('-w', '--working-copy', dest='working-copy',
action='store_true', default=False,
- help='mark the output as a working copy')
- ]
+ help='mark the output as a working copy'
+ )
+ ]
+ transform_options = [
+ Option(
+ '-b', '--base-url', dest='base_url', metavar='URL',
+ help='specifies the base URL for relative image references'
+ ),
+ ]
if __name__ == '__main__':
help='output raw text for use in templates')
]
transform_options = [
- Option('--css', dest='css')
+ Option('--css', dest='css'),
+ Option(
+ '-b', '--base-url', dest='base_url', metavar='URL',
+ help='specifies the base URL for relative image references'
+ ),
]
parser_options = [
Option('-i', '--ignore-dublin-core', dest='parse_dublincore',
uses_provider = True
transform_options = [
- Option('-k', '--use-kindlegen',
- action='store_true', dest='use_kindlegen', default=False,
- help='use kindlegen tool instead of Calibre')
- ]
+ Option(
+ '-k', '--use-kindlegen',
+ action='store_true', dest='use_kindlegen', default=False,
+ help='use kindlegen tool instead of Calibre'
+ ),
+ Option(
+ '-b', '--base-url', dest='base_url', metavar='URL',
+ help='specifies the base URL for relative image references'
+ ),
+ ]
if __name__ == '__main__':
uses_cover = True
uses_provider = True
transform_options = [
- Option('-t', '--save-tex', dest='save_tex', metavar='FILE',
- help='path to save the intermediary LaTeX file to'),
- Option('-m', '--morefloats', dest='morefloats', metavar='old/new/none',
- help='force morefloats in old (<1.0c), new (>=1.0c) or none')
+ Option(
+ '-t', '--save-tex', dest='save_tex', metavar='FILE',
+ help='path to save the intermediary LaTeX file to'
+ ),
+ Option(
+ '-m', '--morefloats', dest='morefloats', metavar='old/new/none',
+ help='force morefloats in old (<1.0c), new (>=1.0c) or none'
+ ),
+ Option(
+ '-b', '--base-url', dest='base_url', metavar='URL',
+ help='specifies the base URL for relative image references'
+ ),
]
setup(
name='librarian',
- version='1.13',
+ version='1.14',
description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
author="Marek Stępniowski",
author_email='marek@stepniowski.com',
**{
"clip-begin": "npt=%.3fs" % item[0],
"clip-end": "npt=%.3fs" % item[1],
- },
+ }
)
zipf.writestr(
with_nota_red = True
no_externalities = False
- def __init__(self, image_location='https://wolnelektury.pl/media/book/pictures/marcos-historia-kolorow/'):
- self.image_location = image_location
+ def __init__(self, base_url=None):
+ self._base_url = base_url
self.tree = text = etree.Element('div', **{'id': 'book-text'})
self.header = etree.SubElement(text, 'h1')
}
self.current_cursors = [text]
+ @property
+ def base_url(self):
+ if self._base_url is not None:
+ return self._base_url
+ else:
+ return 'https://wolnelektury.pl/media/book/pictures/{}/'.format(self.document.meta.url.slug)
+
@property
def cursor(self):
return self.current_cursors[-1]
document._compat_assign_section_ids()
def build(self, document, **kwargs):
+ self.document = document
+
self.preprocess(document)
document.tree.getroot().html_build(self)
self.postprocess(document)
help='specifies the directory for output'
)
+ # Specific
+ parser.add_argument(
+ '-b', '--base-url', metavar="URL",
+ help="Base for relative URLs in documents (like image sources)"
+ )
+
parser.add_argument(
'--mp3',
metavar="FILE",
nargs="*",
help='specifies an MP3 file, if needed'
)
-
+
args = parser.parse_args()
builder = builders[args.builder]
import gettext
import os
import re
-from urllib.request import urlopen
from lxml import etree
+import six
from .parser import parser
from . import dcparser, DCNS
from .functions import lang_code_3to2
class WLDocument:
def __init__(self, filename=None, url=None):
- source = filename or urlopen(url)
+ source = filename or six.moves.urllib.request.urlopen(url)
tree = etree.parse(source, parser=parser)
self.tree = tree
tree.getroot().document = self
def get_html_attr(self, builder):
return {
- 'src': builder.image_location + self.attrib['src'],
+ 'src': builder.base_url + self.attrib['src'],
'alt': self.attrib['alt'],
'title': self.attrib['alt'],
}
+# -*- coding: utf-8
+from __future__ import unicode_literals
+
from ..base import WLElement
from ebooklib import epub
from lxml import etree
+from PIL import Image
from tempfile import mkdtemp, NamedTemporaryFile
from shutil import rmtree
def transform(wldoc, verbose=False, style=None,
sample=None, cover=None, flags=None, hyphenate=False,
- ilustr_path='', output_type='epub'):
+ base_url='file://./', output_type='epub'):
""" produces a EPUB file
sample=n: generate sample e-book (with at least n paragraphs)
functions.reg_mathml_epub(output)
- if os.path.isdir(ilustr_path):
- ilustr_elements = set(ilustr.get('src')
- for ilustr in document.edoc.findall('//ilustr'))
- for i, filename in enumerate(os.listdir(ilustr_path)):
- if filename not in ilustr_elements:
- continue
- file_path = os.path.join(ilustr_path, filename)
- with open(file_path, 'rb') as f:
- output.add_item(
- epub.EpubItem(
- uid='image%s' % i,
- file_name=filename,
- media_type=guess_type(file_path)[0],
- content=f.read()
- )
- )
+ # FIXME
+ for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
+ url = six.moves.urllib.parse.urljoin(
+ base_url,
+ ilustr.get('src')
+ )
+ with six.moves.urllib.request.urlopen(url) as imgfile:
+ img = Image.open(imgfile)
+
+ th_format, ext, media_type = {
+ 'GIF': ('GIF', 'gif', 'image/gif'),
+ 'PNG': ('PNG', 'png', 'image/png'),
+ }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
+ width = 1200
+ if img.size[0] < width:
+ th = img
+ else:
+ th = img.resize((width, round(width * img.size[1] / img.size[0])))
+
+ buffer = six.BytesIO()
+ th.save(buffer, format=th_format)
+
+ file_name = 'image%d.%s' % (i, ext)
+ ilustr.set('src', file_name)
+ output.add_item(
+ epub.EpubItem(
+ uid='image%s' % i,
+ file_name=file_name,
+ media_type=media_type,
+ content=buffer.getvalue()
+ )
+ )
+
# write static elements
with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
return re.sub('</?blockquote[^>]*>', '', html)
-def add_image_sizes(tree, gallery_path, gallery_url):
- widths = [360, 600, 1200, 1800]
- for ilustr in tree.findall('//ilustr'):
+def add_image_sizes(tree, gallery_path, gallery_url, base_url):
+ widths = [360, 600, 1200, 1800, 2400]
+
+ for i, ilustr in enumerate(tree.findall('//ilustr')):
rel_path = ilustr.attrib['src']
- img = Image.open(gallery_path + rel_path)
+ img_url = six.moves.urllib.parse.urljoin(base_url, rel_path)
+
+ with six.moves.urllib.request.urlopen(img_url) as f:
+ img = Image.open(f)
+
+ ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
+
srcset = []
+ # Needed widths: predefined and original, limited by
+ # whichever is smaller.
+ img_widths = [
+ w for w in
+ sorted(
+ set(widths + [img.size[0]])
+ )
+ if w <= min(widths[-1], img.size[0])
+ ]
+ largest = None
-        for w in widths:
+        for w in img_widths:
- if w < img.size[0]:
- height = round(img.size[1] * w / img.size[0])
- th = img.resize((w, height))
-
- fname = ('.W%d.' % w).join(rel_path.rsplit('.', 1))
- th.save(gallery_path + fname)
- srcset.append(" ".join((
- gallery_url + fname,
- '%dw' % w
- )))
- srcset.append(" ".join((
- gallery_url + rel_path,
- '%dw' % img.size[0]
- )))
+ height = round(img.size[1] * w / img.size[0])
+ th = img.resize((w, height))
+ fname = '%d.W%d.%s' % (i, w, ext)
+ th.save(gallery_path + fname)
+ th_url = gallery_url + fname
+ srcset.append(" ".join((
+ th_url,
+ '%dw' % w
+ )))
+ largest_url = th_url
ilustr.attrib['srcset'] = ", ".join(srcset)
- ilustr.attrib['src'] = gallery_url + rel_path
+ ilustr.attrib['src'] = largest_url
-def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/'):
+def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/', base_url='file://./'):
"""Transforms the WL document to XHTML.
If output_filename is None, returns an XML,
if not options:
options = {}
- add_image_sizes(document.edoc, gallery_path, gallery_url)
+ try:
+ os.makedirs(gallery_path)
+ except OSError:
+ pass
+
+ add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)
css = (
css
def transform(wldoc, verbose=False, sample=None, cover=None,
- use_kindlegen=False, flags=None, hyphenate=True, ilustr_path='',
+ use_kindlegen=False, flags=None, hyphenate=True, base_url='',
converter_path=None):
""" produces a MOBI file
epub = document.as_epub(verbose=verbose, sample=sample,
cover=cover or True, flags=flags,
- hyphenate=hyphenate, ilustr_path=ilustr_path,
+ hyphenate=hyphenate, base_url=base_url,
output_type='mobi')
if verbose:
kwargs = {}
from subprocess import call, PIPE
from itertools import chain
+from PIL import Image
from Texml.processor import process
from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError
def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
- cover=None, flags=None, customizations=None, ilustr_path='',
+ cover=None, flags=None, customizations=None, base_url='file://./',
latex_dir=False):
""" produces a PDF file with XeLaTeX
# TeXML -> LaTeX
temp = mkdtemp('-wl2pdf')
- for ilustr in document.edoc.findall("//ilustr"):
- shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
+ for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
+ url = six.moves.urllib.parse.urljoin(
+ base_url,
+ ilustr.get('src')
+ )
+ with six.moves.urllib.request.urlopen(url) as imgfile:
+ img = Image.open(imgfile)
+
+ th_format, ext, media_type = {
+ 'GIF': ('GIF', 'gif', 'image/gif'),
+ 'PNG': ('PNG', 'png', 'image/png'),
+ }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
+
+ width = 2400
+ if img.size[0] < width:
+ th = img
+ else:
+ th = img.resize((width, round(width * img.size[1] / img.size[0])))
+
+ file_name = 'image%d.%s' % (i, ext)
+ th.save(os.path.join(temp, file_name))
+ ilustr.set('src', file_name)
for sponsor in book_info.sponsors:
ins = etree.Element("data-sponsor", name=sponsor)
import io
from unittest import TestCase
from librarian import NoDublinCore
+from librarian.builders import builders
from librarian.document import WLDocument
from librarian.parser import WLDocument as LegacyWLDocument
from nose.tools import *
expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.html')
html = WLDocument(
filename=get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
- ).build('html').get_bytes().decode('utf-8')
+ ).build(builders['html']).get_bytes().decode('utf-8')
self.assertEqual(html, io.open(expected_output_file_path).read())
from __future__ import unicode_literals
from librarian import NoDublinCore
+from librarian.builders import builders
from librarian.parser import WLDocument as LegacyWLDocument
from librarian.document import WLDocument
from nose.tools import *
text = WLDocument(
filename=get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
- ).build('txt').get_bytes()
+ ).build(builders['txt']).get_bytes()
assert_equal(text, open(expected_output_file_path, 'rb').read())
nose
coverage
passenv = HOME ; Needed to find locally installed fonts when testing PDF production.
+download = true
commands =
nosetests --with-coverage --cover-package=librarian -d --with-doctest --with-xunit --exe
-install_command = pip install --extra-index-url https://py.mdrn.pl/simple {packages}
+install_command = pip install numpy; pip install --extra-index-url https://py.mdrn.pl/simple {packages}
[testenv:clean]
basepython = python3