This document records all notable changes to Librarian.
+## 1.14 (2021-02-05)
+
+### Changed
+- Image sources are now URLs. This changes the API: instead of paths
+ given as `ilustr_path`, `transform` functions now accept
+ a new `base_url` parameter.
+- Size limits introduced for images in all formats.
+
+
## 1.13 (2021-01-27)
### Changed
transform_flags = [
Option('-w', '--working-copy', dest='working-copy',
action='store_true', default=False,
- help='mark the output as a working copy')
- ]
+ help='mark the output as a working copy'
+ )
+ ]
+ transform_options = [
+ Option(
+ '-b', '--base-url', dest='base_url', metavar='URL',
+ help='specifies the base URL for relative image references'
+ ),
+ ]
if __name__ == '__main__':
help='output raw text for use in templates')
]
transform_options = [
- Option('--css', dest='css')
+ Option('--css', dest='css'),
+ Option(
+ '-b', '--base-url', dest='base_url', metavar='URL',
+ help='specifies the base URL for relative image references'
+ ),
]
parser_options = [
Option('-i', '--ignore-dublin-core', dest='parse_dublincore',
uses_provider = True
transform_options = [
- Option('-k', '--use-kindlegen',
- action='store_true', dest='use_kindlegen', default=False,
- help='use kindlegen tool instead of Calibre')
- ]
+ Option(
+ '-k', '--use-kindlegen',
+ action='store_true', dest='use_kindlegen', default=False,
+ help='use kindlegen tool instead of Calibre'
+ ),
+ Option(
+ '-b', '--base-url', dest='base_url', metavar='URL',
+ help='specifies the base URL for relative image references'
+ ),
+ ]
if __name__ == '__main__':
uses_cover = True
uses_provider = True
transform_options = [
- Option('-t', '--save-tex', dest='save_tex', metavar='FILE',
- help='path to save the intermediary LaTeX file to'),
- Option('-m', '--morefloats', dest='morefloats', metavar='old/new/none',
- help='force morefloats in old (<1.0c), new (>=1.0c) or none')
+ Option(
+ '-t', '--save-tex', dest='save_tex', metavar='FILE',
+ help='path to save the intermediary LaTeX file to'
+ ),
+ Option(
+ '-m', '--morefloats', dest='morefloats', metavar='old/new/none',
+ help='force morefloats in old (<1.0c), new (>=1.0c) or none'
+ ),
+ Option(
+ '-b', '--base-url', dest='base_url', metavar='URL',
+ help='specifies the base URL for relative image references'
+ ),
]
setup(
name='librarian',
- version='1.13',
+ version='1.14',
description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
author="Marek Stępniowski",
author_email='marek@stepniowski.com',
**{
"clip-begin": "npt=%.3fs" % item[0],
"clip-end": "npt=%.3fs" % item[1],
- },
+ }
)
zipf.writestr(
with_nota_red = True
no_externalities = False
- def __init__(self, image_location='https://wolnelektury.pl/media/book/pictures/marcos-historia-kolorow/'):
- self.image_location = image_location
+ def __init__(self, base_url=None):
+ self._base_url = base_url
self.tree = text = etree.Element('div', **{'id': 'book-text'})
self.header = etree.SubElement(text, 'h1')
}
self.current_cursors = [text]
+ @property
+ def base_url(self):
+ if self._base_url is not None:
+ return self._base_url
+ else:
+ return 'https://wolnelektury.pl/media/book/pictures/{}/'.format(self.document.meta.url.slug)
+
@property
def cursor(self):
return self.current_cursors[-1]
document._compat_assign_section_ids()
def build(self, document, **kwargs):
+ self.document = document
+
self.preprocess(document)
document.tree.getroot().html_build(self)
self.postprocess(document)
help='specifies the directory for output'
)
+ # Specific
+ parser.add_argument(
+ '-b', '--base-url', metavar="URL",
+ help="Base for relative URLs in documents (like image sources)"
+ )
+
parser.add_argument(
'--mp3',
metavar="FILE",
nargs="*",
help='specifies an MP3 file, if needed'
)
-
+
args = parser.parse_args()
builder = builders[args.builder]
import gettext
import os
import re
-from urllib.request import urlopen
from lxml import etree
+import six
from .parser import parser
from . import dcparser, DCNS
from .functions import lang_code_3to2
class WLDocument:
def __init__(self, filename=None, url=None):
- source = filename or urlopen(url)
+ source = filename or six.moves.urllib.request.urlopen(url)
tree = etree.parse(source, parser=parser)
self.tree = tree
tree.getroot().document = self
def get_html_attr(self, builder):
return {
- 'src': builder.image_location + self.attrib['src'],
+ 'src': builder.base_url + self.attrib['src'],
'alt': self.attrib['alt'],
'title': self.attrib['alt'],
}
+# -*- coding: utf-8
+from __future__ import unicode_literals
+
from ..base import WLElement
from ebooklib import epub
from lxml import etree
+from PIL import Image
from tempfile import mkdtemp, NamedTemporaryFile
from shutil import rmtree
def transform(wldoc, verbose=False, style=None,
sample=None, cover=None, flags=None, hyphenate=False,
- ilustr_path='', output_type='epub'):
+ base_url='file://./', output_type='epub'):
""" produces a EPUB file
sample=n: generate sample e-book (with at least n paragraphs)
functions.reg_mathml_epub(output)
- if os.path.isdir(ilustr_path):
- ilustr_elements = set(ilustr.get('src')
- for ilustr in document.edoc.findall('//ilustr'))
- for i, filename in enumerate(os.listdir(ilustr_path)):
- if filename not in ilustr_elements:
- continue
- file_path = os.path.join(ilustr_path, filename)
- with open(file_path, 'rb') as f:
- output.add_item(
- epub.EpubItem(
- uid='image%s' % i,
- file_name=filename,
- media_type=guess_type(file_path)[0],
- content=f.read()
- )
- )
+ # FIXME
+ for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
+ url = six.moves.urllib.parse.urljoin(
+ base_url,
+ ilustr.get('src')
+ )
+ with six.moves.urllib.request.urlopen(url) as imgfile:
+ img = Image.open(imgfile)
+
+ th_format, ext, media_type = {
+ 'GIF': ('GIF', 'gif', 'image/gif'),
+ 'PNG': ('PNG', 'png', 'image/png'),
+ }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
+ width = 1200
+ if img.size[0] < width:
+ th = img
+ else:
+ th = img.resize((width, round(width * img.size[1] / img.size[0])))
+
+ buffer = six.BytesIO()
+ th.save(buffer, format=th_format)
+
+ file_name = 'image%d.%s' % (i, ext)
+ ilustr.set('src', file_name)
+ output.add_item(
+ epub.EpubItem(
+ uid='image%s' % i,
+ file_name=file_name,
+ media_type=media_type,
+ content=buffer.getvalue()
+ )
+ )
+
# write static elements
with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
return re.sub('</?blockquote[^>]*>', '', html)
-def add_image_sizes(tree, gallery_path, gallery_url):
- widths = [360, 600, 1200, 1800]
- for ilustr in tree.findall('//ilustr'):
+def add_image_sizes(tree, gallery_path, gallery_url, base_url):
+ widths = [360, 600, 1200, 1800, 2400]
+
+ for i, ilustr in enumerate(tree.findall('//ilustr')):
rel_path = ilustr.attrib['src']
- img = Image.open(gallery_path + rel_path)
+ img_url = six.moves.urllib.parse.urljoin(base_url, rel_path)
+
+ with six.moves.urllib.request.urlopen(img_url) as f:
+ img = Image.open(f)
+
+ ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
+
srcset = []
+ # Needed widths: predefined and original, limited by
+ # whichever is smaller.
+ img_widths = [
+ w for w in
+ sorted(
+ set(widths + [img.size[0]])
+ )
+ if w <= min(widths[-1], img.size[0])
+ ]
+ largest = None
-        for w in widths:
+        for w in img_widths:
- if w < img.size[0]:
- height = round(img.size[1] * w / img.size[0])
- th = img.resize((w, height))
-
- fname = ('.W%d.' % w).join(rel_path.rsplit('.', 1))
- th.save(gallery_path + fname)
- srcset.append(" ".join((
- gallery_url + fname,
- '%dw' % w
- )))
- srcset.append(" ".join((
- gallery_url + rel_path,
- '%dw' % img.size[0]
- )))
+ height = round(img.size[1] * w / img.size[0])
+ th = img.resize((w, height))
+ fname = '%d.W%d.%s' % (i, w, ext)
+ th.save(gallery_path + fname)
+ th_url = gallery_url + fname
+ srcset.append(" ".join((
+ th_url,
+ '%dw' % w
+ )))
+ largest_url = th_url
ilustr.attrib['srcset'] = ", ".join(srcset)
- ilustr.attrib['src'] = gallery_url + rel_path
+ ilustr.attrib['src'] = largest_url
-def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/'):
+def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/', base_url='file://./'):
"""Transforms the WL document to XHTML.
If output_filename is None, returns an XML,
if not options:
options = {}
- add_image_sizes(document.edoc, gallery_path, gallery_url)
+ try:
+ os.makedirs(gallery_path)
+ except OSError:
+ pass
+
+ add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)
css = (
css
def transform(wldoc, verbose=False, sample=None, cover=None,
- use_kindlegen=False, flags=None, hyphenate=True, ilustr_path='',
+ use_kindlegen=False, flags=None, hyphenate=True, base_url='',
converter_path=None):
""" produces a MOBI file
epub = document.as_epub(verbose=verbose, sample=sample,
cover=cover or True, flags=flags,
- hyphenate=hyphenate, ilustr_path=ilustr_path,
+ hyphenate=hyphenate, base_url=base_url,
output_type='mobi')
if verbose:
kwargs = {}
from subprocess import call, PIPE
from itertools import chain
+from PIL import Image
from Texml.processor import process
from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError
def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
- cover=None, flags=None, customizations=None, ilustr_path='',
+ cover=None, flags=None, customizations=None, base_url='file://./',
latex_dir=False):
""" produces a PDF file with XeLaTeX
# TeXML -> LaTeX
temp = mkdtemp('-wl2pdf')
- for ilustr in document.edoc.findall("//ilustr"):
- shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
+ for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
+ url = six.moves.urllib.parse.urljoin(
+ base_url,
+ ilustr.get('src')
+ )
+ with six.moves.urllib.request.urlopen(url) as imgfile:
+ img = Image.open(imgfile)
+
+ th_format, ext, media_type = {
+ 'GIF': ('GIF', 'gif', 'image/gif'),
+ 'PNG': ('PNG', 'png', 'image/png'),
+ }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
+
+ width = 2400
+ if img.size[0] < width:
+ th = img
+ else:
+ th = img.resize((width, round(width * img.size[1] / img.size[0])))
+
+ file_name = 'image%d.%s' % (i, ext)
+ th.save(os.path.join(temp, file_name))
+ ilustr.set('src', file_name)
for sponsor in book_info.sponsors:
ins = etree.Element("data-sponsor", name=sponsor)
import io
from unittest import TestCase
from librarian import NoDublinCore
+from librarian.builders import builders
from librarian.document import WLDocument
from librarian.parser import WLDocument as LegacyWLDocument
from nose.tools import *
expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.html')
html = WLDocument(
filename=get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
- ).build('html').get_bytes().decode('utf-8')
+ ).build(builders['html']).get_bytes().decode('utf-8')
self.assertEqual(html, io.open(expected_output_file_path).read())
from __future__ import unicode_literals
from librarian import NoDublinCore
+from librarian.builders import builders
from librarian.parser import WLDocument as LegacyWLDocument
from librarian.document import WLDocument
from nose.tools import *
text = WLDocument(
filename=get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml')
- ).build('txt').get_bytes()
+ ).build(builders['txt']).get_bytes()
assert_equal(text, open(expected_output_file_path, 'rb').read())
nose
coverage
passenv = HOME ; Needed to find locally installed fonts when testing PDF production.
+download = true
commands =
nosetests --with-coverage --cover-package=librarian -d --with-doctest --with-xunit --exe
-install_command = pip install --extra-index-url https://py.mdrn.pl/simple {packages}
+install_command = pip install numpy; pip install --extra-index-url https://py.mdrn.pl/simple {packages}
[testenv:clean]
basepython = python3