From 4e329824f40367945de11d3647396859092f5c2c Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Wed, 27 Feb 2019 10:13:41 +0100 Subject: [PATCH] Python 3.4-3.7 support; Added coverter_path argument in mobi.transform; Replaced all `from_string` and `get_string` methods with `from_bytes` and `get_bytes`; Fixed packaging, added a changelog, updated and added more tests, added Tox config. Version bump. --- .gitignore | 3 + AUTHORS.md | 18 +++-- CHANGELOG.md | 20 ++++++ MANIFEST.in | 15 ++++ README.md | 18 ++--- librarian/__init__.py | 69 +++++++++---------- librarian/book2anything.py | 15 ++-- librarian/cover.py | 32 +++++---- librarian/dcparser.py | 53 +++++++------- librarian/embeds/__init__.py | 2 + librarian/embeds/latex.py | 8 ++- librarian/embeds/mathml.py | 5 +- librarian/epub.py | 60 ++++++++-------- librarian/fb2.py | 5 +- librarian/functions.py | 4 +- librarian/html.py | 33 +++++---- librarian/hyphenator.py | 3 +- librarian/mobi.py | 14 ++-- librarian/packagers.py | 12 ++-- librarian/parser.py | 22 +++--- librarian/partners.py | 13 ++-- librarian/pdf.py | 31 +++++---- librarian/picture.py | 20 +++--- librarian/sponsor.py | 2 + librarian/text.py | 9 ++- librarian/util.py | 28 ++++---- scripts/book2cover | 4 +- scripts/book2epub | 2 + scripts/book2fb2 | 2 + scripts/book2html | 2 + scripts/book2mobi | 2 + scripts/book2partner | 19 ++--- scripts/book2pdf | 2 + scripts/book2txt | 2 + scripts/bookfragments | 6 +- scripts/fn_qualifiers_list_from_redmine.py | 13 ++-- scripts/genslugs | 10 +-- setup.py | 11 +-- .../dcparser/andersen_brzydkie_kaczatko.out | 2 +- tests/files/dcparser/biedrzycki_akslop.out | 2 +- tests/files/dcparser/kochanowski_piesn7.out | 2 +- tests/files/dcparser/mickiewicz_rybka.out | 2 +- tests/files/dcparser/sofokles_antygona.out | 2 +- .../files/text/asnyk_miedzy_nami_expected.fb2 | 46 +++++++++++++ .../files/text/asnyk_miedzy_nami_expected.txt | 2 + .../text/asnyk_miedzy_nami_expected_raw.txt | 22 ++++++ tests/test_dcparser.py | 16 +++-- tests/test_epub.py | 12 ++++ tests/test_fb2.py | 22 ++++++ tests/test_html.py | 12 ++-- tests/test_html_annotations.py | 58 ++++++++-------- tests/test_html_fragments.py | 8 ++- tests/test_mobi.py | 20 ++++++ tests/test_pdf.py | 10 +-- tests/test_picture.py | 3 +- tests/test_text.py | 18 ++++- tests/utils.py | 1 - tox.ini | 28 ++++++++ 58 files changed, 572 insertions(+), 305 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 MANIFEST.in mode change 100644 => 100755 scripts/fn_qualifiers_list_from_redmine.py create mode 100644 tests/files/text/asnyk_miedzy_nami_expected.fb2 create mode 100644 tests/files/text/asnyk_miedzy_nami_expected_raw.txt create mode 100644 tests/test_fb2.py create mode 100644 tests/test_mobi.py create mode 100644 tox.ini diff --git a/.gitignore b/.gitignore index b6c0f8a..0660acf 100755 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,6 @@ build .project .pydevproject .settings +/.tox +/nosetests.xml +/htmlcov diff --git a/AUTHORS.md b/AUTHORS.md index 70fe140..2eab59f 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -1,9 +1,17 @@ Authors ------- -Originally written by Marek Stępniowski - -Later contributions: +List of people who have contributed to the project, in chronological order: + +* Marek Stępniowski +* Łukasz Rekucki +* Radek Czajka +* Łukasz Anwajler +* Adam Twardoch +* Marcin Koziej +* Michał Górny +* Aleksander Łukasz +* Robert Błaut +* Jan Szejko + - * Łukasz Rekucki - * Radek Czajka diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..dbc3209 --- /dev/null +++ 
b/CHANGELOG.md @@ -0,0 +1,20 @@ +# Change Log + +This document records all notable changes to Librarian. + +## 1.7 (2019-02-27) + +### Added +- Python 3.4+ support, in addition to existing Python 2.7 support. +- `converter_path` argument in `mobi.transform`. +- Proper packaging info. +- This changelog. +- Tox configuration for tests. + +### Changed +- `from_bytes` methods replaced all `from_string` methods, + i.e. on: OutputFile, WorkInfo, BookInfo, WLDocument, WLPicture. +- `get_bytes` replaced `get_string` on OutputFile. + +### Removed +- Shims for Python < 2.7. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..af6efac --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,15 @@ +include *.md +include LICENSE +include NOTICE +include tox.ini +recursive-include scripts *.py *.css +recursive-include tests *.py *.xml *.html *.out *.txt *.jpeg +include librarian/xslt/*.xslt +include librarian/xslt/*.xml +include librarian/epub/* +include librarian/pdf/* +include librarian/fb2/* +include librarian/fonts/* +graft librarian/res +graft librarian/font-optimizer + diff --git a/README.md b/README.md index c0e13e9..dea2381 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,9 @@ License ![AGPL Logo](http://www.gnu.org/graphics/agplv3-155x51.png) - Copyright © 2008,2009,2010 Fundacja Nowoczesna Polska + Copyright © 2008-2019 Fundacja Nowoczesna Polska - For full list of contributors see AUTHORS section at the end. + For full list of contributors see AUTHORS file. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -29,10 +29,12 @@ other formats, which are more suitable for presentation. Currently we support: - * HTML4, XHTML 1.0 + * HTML4, XHTML 1.0 (?) * Plain text * EPUB (XHTML based) + * MOBI * print-ready PDF + * FB2 Other features: @@ -84,13 +86,3 @@ To convert a file to PDF: To extract book fragments marked as "theme": bookfragments file1.xml [file2.xml ...] - - -Authors ------- -Originally written by Marek Stępniowski - -Later contributions: - - * Łukasz Rekucki - * Radek Czajka \ No newline at end of file diff --git a/librarian/__init__.py b/librarian/__init__.py index 9a9e23e..119b6b1 100644 --- a/librarian/__init__.py +++ b/librarian/__init__.py @@ -3,28 +3,28 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from __future__ import with_statement +from __future__ import print_function, unicode_literals import os import re import shutil +from tempfile import NamedTemporaryFile import urllib - -from util import makedirs +from lxml import etree +import six +from six.moves.urllib.request import FancyURLopener +from .util import makedirs +@six.python_2_unicode_compatible class UnicodeException(Exception): def __str__(self): - """ Dirty workaround for Python Unicode handling problems. """ - return unicode(self).encode('utf-8') - - def __unicode__(self): """ Dirty workaround for Python Unicode handling problems.
""" args = self.args[0] if len(self.args) == 1 else self.args try: - message = unicode(args) + message = six.text_type(args) except UnicodeDecodeError: - message = unicode(args, encoding='utf-8', errors='ignore') + message = six.text_type(args, encoding='utf-8', errors='ignore') return message class ParseError(UnicodeException): @@ -79,6 +79,7 @@ PLMETNS = XMLNamespace("http://dl.psnc.pl/schemas/plmet/") WLNS = EmptyNamespace() +@six.python_2_unicode_compatible class WLURI(object): """Represents a WL URI. Extracts slug from it.""" slug = None @@ -88,7 +89,7 @@ class WLURI(object): '(?P[-a-z0-9]+)/?$') def __init__(self, uri): - uri = unicode(uri) + uri = six.text_type(uri) self.uri = uri self.slug = uri.rstrip('/').rsplit('/', 1)[-1] @@ -104,16 +105,13 @@ class WLURI(object): def from_slug(cls, slug): """Contructs an URI from slug. - >>> WLURI.from_slug('a-slug').uri - u'http://wolnelektury.pl/katalog/lektura/a-slug/' + >>> print(WLURI.from_slug('a-slug').uri) + http://wolnelektury.pl/katalog/lektura/a-slug/ """ uri = 'http://wolnelektury.pl/katalog/lektura/%s/' % slug return cls(uri) - def __unicode__(self): - return self.uri - def __str__(self): return self.uri @@ -146,11 +144,10 @@ class DirDocProvider(DocProvider): def by_slug(self, slug): fname = slug + '.xml' - return open(os.path.join(self.dir, fname)) + return open(os.path.join(self.dir, fname), 'rb') -import lxml.etree as etree -import dcparser +from . import dcparser DEFAULT_BOOKINFO = dcparser.BookInfo( { RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'}, @@ -175,14 +172,14 @@ DEFAULT_BOOKINFO = dcparser.BookInfo( def xinclude_forURI(uri): e = etree.Element(XINS("include")) e.set("href", uri) - return etree.tostring(e, encoding=unicode) + return etree.tostring(e, encoding='unicode') def wrap_text(ocrtext, creation_date, bookinfo=DEFAULT_BOOKINFO): """Wrap the text within the minimal XML structure with a DC template.""" bookinfo.created_at = creation_date dcstring = etree.tostring(bookinfo.to_etree(), \ - method='xml', encoding=unicode, pretty_print=True) + method='xml', encoding='unicode', pretty_print=True) return u'\n' + dcstring + u'\n\n' + ocrtext + \ u'\n\n' @@ -192,7 +189,7 @@ def serialize_raw(element): b = u'' + (element.text or '') for child in element.iterchildren(): - e = etree.tostring(child, method='xml', encoding=unicode, + e = etree.tostring(child, method='xml', encoding='unicode', pretty_print=True) b += e @@ -212,7 +209,7 @@ def get_resource(path): class OutputFile(object): """Represents a file returned by one of the converters.""" - _string = None + _bytes = None _filename = None def __del__(self): @@ -220,14 +217,14 @@ class OutputFile(object): os.unlink(self._filename) def __nonzero__(self): - return self._string is not None or self._filename is not None + return self._bytes is not None or self._filename is not None @classmethod - def from_string(cls, string): + def from_bytes(cls, bytestring): """Converter returns contents of a file as a string.""" instance = cls() - instance._string = string + instance._bytes = bytestring return instance @classmethod @@ -238,33 +235,31 @@ class OutputFile(object): instance._filename = filename return instance - def get_string(self): - """Get file's contents as a string.""" + def get_bytes(self): + """Get file's contents as a bytestring.""" if self._filename is not None: - with open(self._filename) as f: + with open(self._filename, 'rb') as f: return f.read() else: - return self._string + return self._bytes def get_file(self): """Get file as a file-like 
object.""" - if self._string is not None: - from StringIO import StringIO - return StringIO(self._string) + if self._bytes is not None: + return six.BytesIO(self._bytes) elif self._filename is not None: - return open(self._filename) + return open(self._filename, 'rb') def get_filename(self): """Get file as a fs path.""" if self._filename is not None: return self._filename - elif self._string is not None: - from tempfile import NamedTemporaryFile + elif self._bytes is not None: temp = NamedTemporaryFile(prefix='librarian-', delete=False) - temp.write(self._string) + temp.write(self._bytes) temp.close() self._filename = temp.name return self._filename @@ -279,6 +274,6 @@ class OutputFile(object): shutil.copy(self.get_filename(), path) -class URLOpener(urllib.FancyURLopener): +class URLOpener(FancyURLopener): version = 'FNP Librarian (http://github.com/fnp/librarian)' urllib._urlopener = URLOpener() diff --git a/librarian/book2anything.py b/librarian/book2anything.py index 0da3b61..948d9fd 100755 --- a/librarian/book2anything.py +++ b/librarian/book2anything.py @@ -4,9 +4,11 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import print_function, unicode_literals + import os.path import optparse - +import six from librarian import DirDocProvider, ParseError from librarian.parser import WLDocument from librarian.cover import make_cover @@ -102,7 +104,10 @@ class Book2Anything(object): try: for main_input in input_filenames: if options.verbose: - print main_input + print(main_input) + + if isinstance(main_input, six.binary_type): + main_input = main_input.decode('utf-8') # Where to find input? if cls.uses_provider: @@ -126,9 +131,9 @@ class Book2Anything(object): doc.save_output_file(output, output_file, options.output_dir, options.make_dir, cls.ext) - except ParseError, e: - print '%(file)s:%(name)s:%(message)s' % { + except ParseError as e: + print('%(file)s:%(name)s:%(message)s' % { 'file': main_input, 'name': e.__class__.__name__, 'message': e - } + }) diff --git a/librarian/cover.py b/librarian/cover.py index 29e24c8..09c8071 100644 --- a/librarian/cover.py +++ b/librarian/cover.py @@ -3,9 +3,11 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# +from __future__ import unicode_literals + import re from PIL import Image, ImageFont, ImageDraw, ImageFilter -from StringIO import StringIO +from six import BytesIO from librarian import get_resource, OutputFile, URLOpener @@ -69,7 +71,7 @@ class TextBox(object): line_width = self.draw.textsize(line, font=font)[0] line = line.strip() + ' ' - pos_x = (self.max_width - line_width) / 2 + pos_x = (self.max_width - line_width) // 2 if shadow_color: self.shadow_draw.text( @@ -144,7 +146,7 @@ class Cover(object): if format is not None: self.format = format if width and height: - self.height = height * self.width / width + self.height = int(round(height * self.width / width)) scale = max(float(width or 0) / self.width, float(height or 0) / self.height) if scale >= 1: self.scale = scale @@ -171,8 +173,8 @@ class Cover(object): # WL logo if metr.logo_width: logo = Image.open(get_resource('res/wl-logo.png')) - logo = logo.resize((metr.logo_width, logo.size[1] * metr.logo_width / logo.size[0])) - img.paste(logo, ((metr.width - metr.logo_width) / 2, img.size[1] - logo.size[1] - metr.logo_bottom)) + logo = logo.resize((metr.logo_width, int(round(logo.size[1] * metr.logo_width / logo.size[0])))) + img.paste(logo, ((metr.width - metr.logo_width) // 2, img.size[1] - logo.size[1] - metr.logo_bottom)) top = metr.author_top tbox = TextBox( @@ -223,9 +225,9 @@ class Cover(object): return self.final_image().save(*args, **default_kwargs) def output_file(self, *args, **kwargs): - imgstr = StringIO() + imgstr = BytesIO() self.save(imgstr, *args, **kwargs) - return OutputFile.from_string(imgstr.getvalue()) + return OutputFile.from_bytes(imgstr.getvalue()) class WLCover(Cover): @@ -347,9 +349,9 @@ class WLCover(Cover): elif self.box_position == 'bottom': box_top = metr.height - metr.box_bottom_margin - box_img.size[1] else: # Middle. - box_top = (metr.height - box_img.size[1]) / 2 + box_top = (metr.height - box_img.size[1]) // 2 - box_left = metr.bar_width + (metr.width - metr.bar_width - box_img.size[0]) / 2 + box_left = metr.bar_width + (metr.width - metr.bar_width - box_img.size[0]) // 2 # Draw the white box. 
ImageDraw.Draw(img).rectangle( @@ -389,17 +391,17 @@ class WLCover(Cover): if src.size[0] * trg_size[1] < src.size[1] * trg_size[0]: resized = ( trg_size[0], - src.size[1] * trg_size[0] / src.size[0] + int(round(src.size[1] * trg_size[0] / src.size[0])) ) - cut = (resized[1] - trg_size[1]) / 2 + cut = (resized[1] - trg_size[1]) // 2 src = src.resize(resized, Image.ANTIALIAS) src = src.crop((0, cut, src.size[0], src.size[1] - cut)) else: resized = ( - src.size[0] * trg_size[1] / src.size[1], + int(round(src.size[0] * trg_size[1] / src.size[1])), trg_size[1], ) - cut = (resized[0] - trg_size[0]) / 2 + cut = (resized[0] - trg_size[0]) // 2 src = src.resize(resized, Image.ANTIALIAS) src = src.crop((cut, 0, src.size[0] - cut, src.size[1])) @@ -448,11 +450,11 @@ class LogoWLCover(WLCover): img.paste(gradient, (metr.bar_width, metr.height - metr.gradient_height), mask=gradient_mask) cursor = metr.width - metr.gradient_logo_margin_right - logo_top = metr.height - metr.gradient_height / 2 - metr.gradient_logo_height / 2 - metr.bleed / 2 + logo_top = int(metr.height - metr.gradient_height / 2 - metr.gradient_logo_height / 2 - metr.bleed / 2) for logo_path in self.gradient_logos[::-1]: logo = Image.open(get_resource(logo_path)) logo = logo.resize( - (logo.size[0] * metr.gradient_logo_height / logo.size[1], metr.gradient_logo_height), + (int(round(logo.size[0] * metr.gradient_logo_height / logo.size[1])), metr.gradient_logo_height), Image.ANTIALIAS) cursor -= logo.size[0] img.paste(logo, (cursor, logo_top), mask=logo) diff --git a/librarian/dcparser.py b/librarian/dcparser.py index f8dfaf9..eeb750a 100644 --- a/librarian/dcparser.py +++ b/librarian/dcparser.py @@ -3,10 +3,14 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from xml.parsers.expat import ExpatError from datetime import date +from functools import total_ordering import time import re +import six from librarian.util import roman_to_int from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS, @@ -16,7 +20,7 @@ import lxml.etree as etree # ElementTree API using libxml2 from lxml.etree import XMLSyntaxError -class TextPlus(unicode): +class TextPlus(six.text_type): pass @@ -27,6 +31,8 @@ class DatePlus(date): # ============== # = Converters = # ============== +@six.python_2_unicode_compatible +@total_ordering class Person(object): """Single person with last name and a list of first names.""" def __init__(self, last_name, *first_names): @@ -55,13 +61,13 @@ class Person(object): def __eq__(self, right): return self.last_name == right.last_name and self.first_names == right.first_names - def __cmp__(self, other): - return cmp((self.last_name, self.first_names), (other.last_name, other.first_names)) + def __lt__(self, other): + return (self.last_name, self.first_names) < (other.last_name, other.first_names) def __hash__(self): return hash((self.last_name, self.first_names)) - def __unicode__(self): + def __str__(self): if len(self.first_names) > 0: return '%s, %s' % (self.last_name, ' '.join(self.first_names)) else: @@ -83,7 +89,7 @@ for now we will translate this to some single date losing information of course. """ try: # check out the "N. poł X w." syntax - if isinstance(text, str): + if isinstance(text, six.binary_type): text = text.decode("utf-8") century_format = u"(?:([12]) *poł[.]? +)?([MCDXVI]+) *w[.,]*(?: *l[.]? *([0-9]+))?" 
@@ -94,7 +100,7 @@ for now we will translate this to some single date losing information of course. if m: half = m.group(1) decade = m.group(3) - century = roman_to_int(str(m.group(2))) + century = roman_to_int(m.group(2)) if half is not None: if decade is not None: raise ValueError("Bad date format. Cannot specify both half and decade of century") @@ -114,7 +120,7 @@ for now we will translate this to some single date losing information of course. raise ValueError return DatePlus(t[0], t[1], t[2]) - except ValueError, e: + except ValueError as e: raise ValueError("Unrecognized date format. Try YYYY-MM-DD or YYYY.") @@ -123,7 +129,7 @@ def as_person(text): def as_unicode(text): - if isinstance(text, unicode): + if isinstance(text, six.text_type): return text else: return TextPlus(text.decode('utf-8')) @@ -174,7 +180,7 @@ class Field(object): if hasattr(val[0], 'lang'): setattr(nv, 'lang', val[0].lang) return nv - except ValueError, e: + except ValueError as e: raise ValidationError("Field '%s' - invald value: %s" % (self.uri, e.message)) def validate(self, fdict, fallbacks=None, strict=False): @@ -221,9 +227,7 @@ class DCInfo(type): return super(DCInfo, mcs).__new__(mcs, classname, bases, class_dict) -class WorkInfo(object): - __metaclass__ = DCInfo - +class WorkInfo(six.with_metaclass(DCInfo, object)): FIELDS = ( Field(DCNS('creator'), 'authors', as_person, salias='author', multiple=True), Field(DCNS('title'), 'title'), @@ -255,9 +259,8 @@ class WorkInfo(object): ) @classmethod - def from_string(cls, xml, *args, **kwargs): - from StringIO import StringIO - return cls.from_file(StringIO(xml), *args, **kwargs) + def from_bytes(cls, xml, *args, **kwargs): + return cls.from_file(six.BytesIO(xml), *args, **kwargs) @classmethod def from_file(cls, xmlfile, *args, **kwargs): @@ -282,9 +285,9 @@ class WorkInfo(object): # extract data from the element and make the info return cls.from_element(desc_tag, *args, **kwargs) - except XMLSyntaxError, e: + except XMLSyntaxError as e: raise ParseError(e) - except ExpatError, e: + except ExpatError as e: raise ParseError(e) @classmethod @@ -306,7 +309,7 @@ class WorkInfo(object): fv = field_dict.get(e.tag, []) if e.text is not None: text = e.text - if not isinstance(text, unicode): + if not isinstance(text, six.text_type): text = text.decode('utf-8') val = TextPlus(text) val.lang = e.attrib.get(XMLNS('lang'), lang) @@ -394,11 +397,11 @@ class WorkInfo(object): for x in v: e = etree.Element(field.uri) if x is not None: - e.text = unicode(x) + e.text = six.text_type(x) description.append(e) else: e = etree.Element(field.uri) - e.text = unicode(v) + e.text = six.text_type(v) description.append(e) return root @@ -413,9 +416,9 @@ class WorkInfo(object): if field.multiple: if len(v) == 0: continue - v = [unicode(x) for x in v if x is not None] + v = [six.text_type(x) for x in v if x is not None] else: - v = unicode(v) + v = six.text_type(v) dc[field.name] = {'uri': field.uri, 'value': v} rdf['fields'] = dc @@ -430,15 +433,15 @@ class WorkInfo(object): if field.multiple: if len(v) == 0: continue - v = [unicode(x) for x in v if x is not None] + v = [six.text_type(x) for x in v if x is not None] else: - v = unicode(v) + v = six.text_type(v) result[field.name] = v if field.salias: v = getattr(self, field.salias) if v is not None: - result[field.salias] = unicode(v) + result[field.salias] = six.text_type(v) return result diff --git a/librarian/embeds/__init__.py b/librarian/embeds/__init__.py index 3b1abdb..fa74530 100644 --- a/librarian/embeds/__init__.py +++ 
b/librarian/embeds/__init__.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import importlib from lxml import etree diff --git a/librarian/embeds/latex.py b/librarian/embeds/latex.py index 0201d08..8425d03 100644 --- a/librarian/embeds/latex.py +++ b/librarian/embeds/latex.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +from __future__ import unicode_literals + import os import shutil from subprocess import call, PIPE @@ -10,14 +12,14 @@ from . import DataEmbed, create_embed, downgrades_to class LaTeX(DataEmbed): @downgrades_to('image/png') def to_png(self): - tmpl = open(get_resource('res/embeds/latex/template.tex')).read().decode('utf-8') + tmpl = open(get_resource('res/embeds/latex/template.tex'), 'rb').read().decode('utf-8') tempdir = mkdtemp('-librarian-embed-latex') fpath = os.path.join(tempdir, 'doc.tex') - with open(fpath, 'w') as f: + with open(fpath, 'wb') as f: f.write((tmpl % {'code': self.data}).encode('utf-8')) call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE) call(['convert', '-density', '150', os.path.join(tempdir, 'doc.pdf'), '-trim', os.path.join(tempdir, 'doc.png')]) - pngdata = open(os.path.join(tempdir, 'doc.png')).read() + pngdata = open(os.path.join(tempdir, 'doc.png'), 'rb').read() shutil.rmtree(tempdir) return create_embed('image/png', data=pngdata) diff --git a/librarian/embeds/mathml.py b/librarian/embeds/mathml.py index dd78f05..bd58baf 100644 --- a/librarian/embeds/mathml.py +++ b/librarian/embeds/mathml.py @@ -1,5 +1,8 @@ # -*- coding: utf-8 -*- +from __future__ import unicode_literals + from lxml import etree +import six from librarian import get_resource from . import TreeEmbed, create_embed, downgrades_to @@ -9,4 +12,4 @@ class MathML(TreeEmbed): def to_latex(self): xslt = etree.parse(get_resource('res/embeds/mathml/mathml2latex.xslt')) output = self.tree.xslt(xslt) - return create_embed('application/x-latex', data=unicode(output)) + return create_embed('application/x-latex', data=six.text_type(output)) diff --git a/librarian/epub.py b/librarian/epub.py index 333b56f..e9670d5 100644 --- a/librarian/epub.py +++ b/librarian/epub.py @@ -3,13 +3,13 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# -from __future__ import with_statement +from __future__ import print_function, unicode_literals import os import os.path import re import subprocess -from StringIO import StringIO +from six import BytesIO from copy import deepcopy from mimetypes import guess_type @@ -30,7 +30,7 @@ functions.reg_lang_code_3to2() def squeeze_whitespace(s): - return re.sub(r'\s+', ' ', s) + return re.sub(b'\\s+', b' ', s) def set_hyph_language(source_tree): @@ -38,7 +38,7 @@ def set_hyph_language(source_tree): result = '' text = ''.join(text) with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f: - for line in f: + for line in f.read().decode('latin1').split('\n'): list = line.strip().split('|') if list[0] == text: result = list[2] @@ -77,12 +77,12 @@ def hyphenate_and_fix_conjunctions(source_tree, hyph): def inner_xml(node): """ returns node's text and children as a string - >>> print inner_xml(etree.fromstring('xyz')) + >>> print(inner_xml(etree.fromstring('xyz'))) xyz """ nt = node.text if node.text is not None else '' - return ''.join([nt] + [etree.tostring(child) for child in node]) + return ''.join([nt] + [etree.tostring(child, encoding='unicode') for child in node]) def set_inner_xml(node, text): @@ -90,7 +90,7 @@ def set_inner_xml(node, text): >>> e = etree.fromstring('bxx') >>> set_inner_xml(e, 'xyz') - >>> print etree.tostring(e) + >>> print(etree.tostring(e, encoding='unicode')) xyz """ @@ -102,7 +102,7 @@ def set_inner_xml(node, text): def node_name(node): """ Find out a node's name - >>> print node_name(etree.fromstring('XYZ')) + >>> print(node_name(etree.fromstring('XYZ'))) XYZ """ @@ -122,7 +122,7 @@ def xslt(xml, sheet, **kwargs): xml = etree.ElementTree(xml) with open(sheet) as xsltf: transform = etree.XSLT(etree.parse(xsltf)) - params = dict((key, transform.strparam(value)) for key, value in kwargs.iteritems()) + params = dict((key, transform.strparam(value)) for key, value in kwargs.items()) return transform(xml, **params) @@ -172,8 +172,8 @@ class Stanza(object): >>> s = etree.fromstring("a c c/\\nbx/\\nyc/ \\nd") >>> Stanza(s).versify() - >>> print etree.tostring(s) - a c cbx/ + >>> print(etree.tostring(s, encoding='unicode')) + a ccbx/ ycd """ @@ -325,8 +325,8 @@ class TOC(object): return "\n".join(texts) def html(self): - with open(get_resource('epub/toc.html')) as f: - t = unicode(f.read(), 'utf-8') + with open(get_resource('epub/toc.html'), 'rb') as f: + t = f.read().decode('utf-8') return t % self.html_part() @@ -546,16 +546,16 @@ def transform(wldoc, verbose=False, style=None, html_toc=False, mime = zipfile.ZipInfo() mime.filename = 'mimetype' mime.compress_type = zipfile.ZIP_STORED - mime.extra = '' - zip.writestr(mime, 'application/epub+zip') + mime.extra = b'' + zip.writestr(mime, b'application/epub+zip') zip.writestr( 'META-INF/container.xml', - '' - '' - '' - '' + b'' + b'' + b'' + b'' ) zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png')) @@ -569,7 +569,7 @@ def transform(wldoc, verbose=False, style=None, html_toc=False, if cover is True: cover = make_cover - cover_file = StringIO() + cover_file = BytesIO() bound_cover = cover(document.book_info) bound_cover.save(cover_file) cover_name = 'cover.%s' % bound_cover.ext() @@ -602,12 +602,12 @@ def transform(wldoc, verbose=False, style=None, html_toc=False, annotations = etree.Element('annotations') toc_file = etree.fromstring( - '' - '' - '' + b'' + b'' + b'' ) nav_map = toc_file[-1] @@ -645,7 +645,7 @@ def transform(wldoc, verbose=False, style=None, html_toc=False, '')) 
spine.append(etree.fromstring( '')) - html_string = open(get_resource('epub/support.html')).read() + html_string = open(get_resource('epub/support.html'), 'rb').read() chars.update(used_chars(etree.fromstring(html_string))) zip.writestr('OPS/support.html', squeeze_whitespace(html_string)) @@ -679,7 +679,7 @@ def transform(wldoc, verbose=False, style=None, html_toc=False, os.path.join(tmpdir, fname)] env = {"PERL_USE_UNSAFE_INC": "1"} if verbose: - print "Running font-optimizer" + print("Running font-optimizer") subprocess.check_call(optimizer_call, env=env) else: dev_null = open(os.devnull, 'w') diff --git a/librarian/fb2.py b/librarian/fb2.py index 25a4c1f..6dd1c35 100644 --- a/librarian/fb2.py +++ b/librarian/fb2.py @@ -3,9 +3,12 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + import os.path from copy import deepcopy from lxml import etree +import six from librarian import functions, OutputFile from .epub import replace_by_verse @@ -62,6 +65,6 @@ def transform(wldoc, verbose=False, result = document.transform(style) - return OutputFile.from_string(unicode(result).encode('utf-8')) + return OutputFile.from_bytes(six.text_type(result).encode('utf-8')) # vim:et diff --git a/librarian/functions.py b/librarian/functions.py index 75e2911..e5a47d6 100644 --- a/librarian/functions.py +++ b/librarian/functions.py @@ -3,6 +3,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from lxml import etree import re @@ -112,7 +114,7 @@ def reg_lang_code_3to2(): result = '' text = ''.join(text) with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f: - for line in f: + for line in f.read().decode('latin1').split('\n'): list = line.strip().split('|') if list[0] == text: result = list[2] diff --git a/librarian/html.py b/librarian/html.py index a566f71..67f0061 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -3,9 +3,10 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# +from __future__ import print_function, unicode_literals + import os import re -import cStringIO import copy from lxml import etree @@ -13,6 +14,8 @@ from librarian import XHTMLNS, ParseError, OutputFile from librarian import functions from lxml.etree import XMLSyntaxError, XSLTApplyError +import six + functions.reg_substitute_entities() functions.reg_person_name() @@ -33,11 +36,10 @@ def html_has_content(text): def transform_abstrakt(abstrakt_element): - from cStringIO import StringIO style_filename = get_stylesheet('legacy') style = etree.parse(style_filename) xml = etree.tostring(abstrakt_element) - document = etree.parse(StringIO(xml.replace('abstrakt', 'dlugi_cytat'))) # HACK + document = etree.parse(six.BytesIO(xml.replace('abstrakt', 'dlugi_cytat'))) # HACK result = document.xslt(style) html = re.sub('', '', etree.tostring(result)) return re.sub(']*>', '', html) @@ -77,16 +79,17 @@ def transform(wldoc, stylesheet='legacy', options=None, flags=None): add_table_of_themes(result.getroot()) add_table_of_contents(result.getroot()) - return OutputFile.from_string(etree.tostring( + return OutputFile.from_bytes(etree.tostring( result, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8')) else: return None except KeyError: raise ValueError("'%s' is not a valid stylesheet.") - except (XMLSyntaxError, XSLTApplyError), e: + except (XMLSyntaxError, XSLTApplyError) as e: raise ParseError(e) +@six.python_2_unicode_compatible class Fragment(object): def __init__(self, id, themes): super(Fragment, self).__init__() @@ -106,7 +109,7 @@ class Fragment(object): try: stack.pop() except IndexError: - print 'CLOSED NON-OPEN TAG:', element + print('CLOSED NON-OPEN TAG:', element) stack.reverse() return self.events + stack @@ -128,7 +131,7 @@ class Fragment(object): return ''.join(result) - def __unicode__(self): + def __str__(self): return self.to_string() @@ -139,7 +142,7 @@ def extract_fragments(input_filename): # iterparse would die on a HTML document parser = etree.HTMLParser(encoding='utf-8') - buf = cStringIO.StringIO() + buf = six.BytesIO() buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8')) buf.seek(0) @@ -173,7 +176,7 @@ def extract_fragments(input_filename): try: fragment = open_fragments[element.get('fid')] except KeyError: - print '%s:closed not open fragment #%s' % (input_filename, element.get('fid')) + print('%s:closed not open fragment #%s' % (input_filename, element.get('fid'))) else: closed_fragments[fragment.id] = fragment del open_fragments[fragment.id] @@ -207,7 +210,7 @@ def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None link_text = prefix anchor = etree.Element('a', href='#%s' % prefix) anchor.set('class', 'anchor') - anchor.text = unicode(link_text) + anchor.text = six.text_type(link_text) parent.insert(index, anchor) if with_target: @@ -247,7 +250,7 @@ def raw_printable_text(element): for e in working.findall('a'): if e.get('class') in ('annotation', 'theme-begin'): e.text = '' - return etree.tostring(working, method='text', encoding=unicode).strip() + return etree.tostring(working, method='text', encoding='unicode').strip() def add_table_of_contents(root): @@ -300,7 +303,7 @@ def add_table_of_themes(root): theme_names = [s.strip() for s in fragment.text.split(',')] for theme_name in theme_names: book_themes.setdefault(theme_name, []).append(fragment.get('name')) - book_themes = book_themes.items() + book_themes = list(book_themes.items()) book_themes.sort(key=lambda s: sortify(s[0])) 
themes_div = etree.Element('div', id="themes") themes_ol = etree.SubElement(themes_div, 'ol') @@ -326,7 +329,7 @@ def extract_annotations(html_path): parser = etree.HTMLParser(encoding='utf-8') tree = etree.parse(html_path, parser) footnotes = tree.find('//*[@id="footnotes"]') - re_qualifier = re.compile(ur'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014') + re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014') if footnotes is not None: for footnote in footnotes.findall('div'): fn_type = footnote.get('class').split('-')[1] @@ -335,8 +338,8 @@ def extract_annotations(html_path): footnote.text = None if len(footnote) and footnote[-1].tail == '\n': footnote[-1].tail = None - text_str = etree.tostring(footnote, method='text', encoding=unicode).strip() - html_str = etree.tostring(footnote, method='html', encoding=unicode).strip() + text_str = etree.tostring(footnote, method='text', encoding='unicode').strip() + html_str = etree.tostring(footnote, method='html', encoding='unicode').strip() match = re_qualifier.match(text_str) if match: diff --git a/librarian/hyphenator.py b/librarian/hyphenator.py index 18d402b..aa5b4c3 100644 --- a/librarian/hyphenator.py +++ b/librarian/hyphenator.py @@ -14,6 +14,7 @@ info@wilbertberendsen.nl License: LGPL. """ +from __future__ import print_function, unicode_literals import sys import re @@ -235,5 +236,5 @@ if __name__ == "__main__": h = Hyphenator(dict_file, left=1, right=1) for i in h(word): - print i + print(i) diff --git a/librarian/mobi.py b/librarian/mobi.py index c3c8f28..6f1f5d6 100644 --- a/librarian/mobi.py +++ b/librarian/mobi.py @@ -3,6 +3,7 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals from copy import deepcopy import os @@ -13,13 +14,16 @@ from librarian import OutputFile def transform(wldoc, verbose=False, sample=None, cover=None, - use_kindlegen=False, flags=None, hyphenate=True, ilustr_path=''): + use_kindlegen=False, flags=None, hyphenate=True, ilustr_path='', + converter_path=None): """ produces a MOBI file wldoc: a WLDocument sample=n: generate sample e-book (with at least n paragraphs) cover: a cover.Cover factory overriding default flags: less-advertising, + converter_path: override path to MOBI converter, + either ebook-convert or kindlegen """ document = deepcopy(wldoc) @@ -40,10 +44,12 @@ def transform(wldoc, verbose=False, sample=None, cover=None, if use_kindlegen: output_file_basename = os.path.basename(output_file.name) - subprocess.check_call(['kindlegen', '-c2', epub.get_filename(), - '-o', output_file_basename], **kwargs) + subprocess.check_call([converter_path or 'kindlegen', + '-c2', epub.get_filename(), + '-o', output_file_basename], **kwargs) else: - subprocess.check_call(['ebook-convert', epub.get_filename(), + subprocess.check_call([converter_path or 'ebook-convert', + epub.get_filename(), output_file.name, '--no-inline-toc', '--mobi-file-type=both', '--mobi-ignore-margins'], **kwargs) diff --git a/librarian/packagers.py b/librarian/packagers.py index f57a983..b3f5548 100644 --- a/librarian/packagers.py +++ b/librarian/packagers.py @@ -3,11 +3,13 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# +from __future__ import print_function, unicode_literals + import os from librarian import pdf, epub, mobi, DirDocProvider, ParseError from librarian.parser import WLDocument -from util import makedirs +from .util import makedirs class Packager(object): @@ -39,14 +41,14 @@ class Packager(object): try: for main_input in input_filenames: if verbose: - print main_input + print(main_input) cls.prepare_file(main_input, output_dir, verbose, overwrite) - except ParseError, e: - print '%(file)s:%(name)s:%(message)s' % { + except ParseError as e: + print('%(file)s:%(name)s:%(message)s' % { 'file': main_input, 'name': e.__class__.__name__, 'message': e.message - } + }) class EpubPackager(Packager): diff --git a/librarian/parser.py b/librarian/parser.py index 43cb0a9..73ddd52 100644 --- a/librarian/parser.py +++ b/librarian/parser.py @@ -3,6 +3,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from librarian import ValidationError, NoDublinCore, ParseError, NoProvider from librarian import RDFNS from librarian.cover import make_cover @@ -14,7 +16,7 @@ from lxml.etree import XMLSyntaxError, XSLTApplyError import os import re -from StringIO import StringIO +import six class WLDocument(object): @@ -45,14 +47,14 @@ class WLDocument(object): self.book_info = None @classmethod - def from_string(cls, xml, *args, **kwargs): - return cls.from_file(StringIO(xml), *args, **kwargs) + def from_bytes(cls, xml, *args, **kwargs): + return cls.from_file(six.BytesIO(xml), *args, **kwargs) @classmethod def from_file(cls, xmlfile, *args, **kwargs): # first, prepare for parsing - if isinstance(xmlfile, basestring): + if isinstance(xmlfile, six.text_type): file = open(xmlfile, 'rb') try: data = file.read() @@ -61,17 +63,17 @@ class WLDocument(object): else: data = xmlfile.read() - if not isinstance(data, unicode): + if not isinstance(data, six.text_type): data = data.decode('utf-8') data = data.replace(u'\ufeff', '') try: parser = etree.XMLParser(remove_blank_text=False) - tree = etree.parse(StringIO(data.encode('utf-8')), parser) + tree = etree.parse(six.BytesIO(data.encode('utf-8')), parser) return cls(tree, *args, **kwargs) - except (ExpatError, XMLSyntaxError, XSLTApplyError), e: + except (ExpatError, XMLSyntaxError, XSLTApplyError) as e: raise ParseError(e) def swap_endlines(self): @@ -139,7 +141,7 @@ class WLDocument(object): def serialize(self): self.update_dc() - return etree.tostring(self.edoc, encoding=unicode, pretty_print=True) + return etree.tostring(self.edoc, encoding='unicode', pretty_print=True) def merge_chunks(self, chunk_dict): unmerged = [] @@ -150,7 +152,7 @@ class WLDocument(object): node = self.edoc.xpath(xpath)[0] repl = etree.fromstring(u"<%s>%s" % (node.tag, data, node.tag)) node.getparent().replace(node, repl) - except Exception, e: + except Exception as e: unmerged.append(repr((key, xpath, e))) return unmerged @@ -220,7 +222,7 @@ class WLDocument(object): if output_dir_path: save_path = output_dir_path if make_author_dir: - save_path = os.path.join(save_path, unicode(self.book_info.author).encode('utf-8')) + save_path = os.path.join(save_path, six.text_type(self.book_info.author).encode('utf-8')) save_path = os.path.join(save_path, self.book_info.url.slug) if ext: save_path += '.%s' % ext diff --git a/librarian/partners.py b/librarian/partners.py index 33198f7..671cf4d 100644 --- a/librarian/partners.py +++ b/librarian/partners.py @@ -11,9 
+11,10 @@ along with custom cover images etc. New partners shouldn't be added here, but in the partners repository. """ +from __future__ import print_function, unicode_literals from librarian import packagers, cover -from util import makedirs +from .util import makedirs class GandalfEpub(packagers.EpubPackager): @@ -79,7 +80,7 @@ class Virtualo(packagers.Packager): try: for main_input in input_filenames: if verbose: - print main_input + print(main_input) path, fname = os.path.realpath(main_input).rsplit('/', 1) provider = DirDocProvider(path) slug, ext = os.path.splitext(fname) @@ -110,13 +111,13 @@ class Virtualo(packagers.Packager): doc.save_output_file( doc.as_mobi(doc, cover=cover.VirtualoCover, sample=25), output_path=outfile_sample) - except ParseError, e: - print '%(file)s:%(name)s:%(message)s' % { + except ParseError as e: + print('%(file)s:%(name)s:%(message)s' % { 'file': main_input, 'name': e.__class__.__name__, 'message': e.message - } + }) xml_file = open(os.path.join(output_dir, 'import_products.xml'), 'w') - xml_file.write(etree.tostring(xml, pretty_print=True, encoding=unicode).encode('utf-8')) + xml_file.write(etree.tostring(xml, pretty_print=True, encoding='unicode').encode('utf-8')) xml_file.close() diff --git a/librarian/pdf.py b/librarian/pdf.py index d67bddf..e6d897d 100644 --- a/librarian/pdf.py +++ b/librarian/pdf.py @@ -9,11 +9,11 @@ Creates one big XML from the book and its children, converts it to LaTeX with TeXML, then runs it by XeLaTeX. """ -from __future__ import with_statement +from __future__ import print_function, unicode_literals + import os import os.path import shutil -from StringIO import StringIO from tempfile import mkdtemp, NamedTemporaryFile import re from copy import deepcopy @@ -23,6 +23,7 @@ from itertools import chain from Texml.processor import process from lxml import etree from lxml.etree import XMLSyntaxError, XSLTApplyError +import six from librarian.dcparser import Person from librarian.parser import WLDocument @@ -57,7 +58,7 @@ def insert_tags(doc, split_re, tagname, exclude=None): >>> t = etree.fromstring('A-B-CX-Y-Z') >>> insert_tags(t, re.compile('-'), 'd') - >>> print etree.tostring(t) + >>> print(etree.tostring(t, encoding='unicode')) ABCXYZ """ @@ -196,11 +197,11 @@ def package_available(package, args='', verbose=False): tempdir = mkdtemp('-wl2pdf-test') fpath = os.path.join(tempdir, 'test.tex') f = open(fpath, 'w') - f.write(r""" - \documentclass{wl} - \usepackage[%s]{%s} - \begin{document} - \end{document} + f.write(""" + \\documentclass{wl} + \\usepackage[%s]{%s} + \\begin{document} + \\end{document} """ % (args, package)) f.close() if verbose: @@ -306,8 +307,8 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, del document # no longer needed large object :) tex_path = os.path.join(temp, 'doc.tex') - fout = open(tex_path, 'w') - process(StringIO(texml), fout, 'utf-8') + fout = open(tex_path, 'wb') + process(six.BytesIO(texml), fout, 'utf-8') fout.close() del texml @@ -329,7 +330,7 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, # some things work better when compiled twice # (table of contents, [line numbers - disabled]) - for run in xrange(2): + for run in range(2): if verbose: p = call(['xelatex', tex_path]) else: @@ -346,7 +347,7 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, shutil.rmtree(temp) return OutputFile.from_filename(output_file.name) - except (XMLSyntaxError, XSLTApplyError), e: + except (XMLSyntaxError, XSLTApplyError) as e: raise ParseError(e) @@ 
-361,14 +362,14 @@ def load_including_children(wldoc=None, provider=None, uri=None): text = f.read().decode('utf-8') f.close() elif wldoc is not None: - text = etree.tostring(wldoc.edoc, encoding=unicode) + text = etree.tostring(wldoc.edoc, encoding='unicode') provider = wldoc.provider else: raise ValueError('Neither a WLDocument, nor provider and URI were provided.') - text = re.sub(ur"([\u0400-\u04ff]+)", ur"\1", text) + text = re.sub(r"([\u0400-\u04ff]+)", r"\1", text) - document = WLDocument.from_string(text, parse_dublincore=True, provider=provider) + document = WLDocument.from_bytes(text.encode('utf-8'), parse_dublincore=True, provider=provider) document.swap_endlines() for child_uri in document.book_info.parts: diff --git a/librarian/picture.py b/librarian/picture.py index 1aa1d07..d255f55 100644 --- a/librarian/picture.py +++ b/librarian/picture.py @@ -1,14 +1,16 @@ # -*- coding: utf-8 -*- +from __future__ import unicode_literals + from operator import and_ -from dcparser import Field, WorkInfo, DCNS +from .dcparser import Field, WorkInfo, DCNS from librarian import (RDFNS, ValidationError, NoDublinCore, ParseError, WLURI) from xml.parsers.expat import ExpatError from os import path -from StringIO import StringIO from lxml import etree from lxml.etree import (XMLSyntaxError, XSLTApplyError, Element) import re +import six class WLPictureURI(WLURI): @@ -99,14 +101,14 @@ class WLPicture(object): self.frame = None @classmethod - def from_string(cls, xml, *args, **kwargs): - return cls.from_file(StringIO(xml), *args, **kwargs) + def from_bytes(cls, xml, *args, **kwargs): + return cls.from_file(six.BytesIO(xml), *args, **kwargs) @classmethod def from_file(cls, xmlfile, parse_dublincore=True, image_store=None): # first, prepare for parsing - if isinstance(xmlfile, basestring): + if isinstance(xmlfile, six.text_type): file = open(xmlfile, 'rb') try: data = file.read() @@ -115,7 +117,7 @@ class WLPicture(object): else: data = xmlfile.read() - if not isinstance(data, unicode): + if not isinstance(data, six.text_type): data = data.decode('utf-8') data = data.replace(u'\ufeff', '') @@ -126,12 +128,12 @@ class WLPicture(object): try: parser = etree.XMLParser(remove_blank_text=False) - tree = etree.parse(StringIO(data.encode('utf-8')), parser) + tree = etree.parse(six.BytesIO(data.encode('utf-8')), parser) me = cls(tree, parse_dublincore=parse_dublincore, image_store=image_store) me.load_frame_info() return me - except (ExpatError, XMLSyntaxError, XSLTApplyError), e: + except (ExpatError, XMLSyntaxError, XSLTApplyError) as e: raise ParseError(e) @property @@ -184,7 +186,7 @@ class WLPicture(object): pd['coords'] = coords def want_unicode(x): - if not isinstance(x, unicode): + if not isinstance(x, six.text_type): return x.decode('utf-8') else: return x diff --git a/librarian/sponsor.py b/librarian/sponsor.py index c9bc35b..1374cda 100644 --- a/librarian/sponsor.py +++ b/librarian/sponsor.py @@ -3,6 +3,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from librarian import get_resource diff --git a/librarian/text.py b/librarian/text.py index 4064849..7ba6d29 100644 --- a/librarian/text.py +++ b/librarian/text.py @@ -3,10 +3,13 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# +from __future__ import unicode_literals + import copy from librarian import functions, OutputFile from lxml import etree import os +import six functions.reg_substitute_entities() @@ -103,7 +106,7 @@ def transform(wldoc, flags=None, **options): 'description': description, 'url': url, 'license_description': license_description, - 'text': unicode(result), + 'text': six.text_type(result), 'source': source, 'contributors': contributors, 'funders': funders, @@ -111,5 +114,5 @@ def transform(wldoc, flags=None, **options): 'isbn': isbn, }).encode('utf-8') else: - result = unicode(result).encode('utf-8') - return OutputFile.from_string("\r\n".join(result.splitlines()) + "\r\n") + result = six.text_type(result).encode('utf-8') + return OutputFile.from_bytes(b"\r\n".join(result.splitlines()) + b"\r\n") diff --git a/librarian/util.py b/librarian/util.py index 0886fd5..c302084 100644 --- a/librarian/util.py +++ b/librarian/util.py @@ -2,6 +2,8 @@ # by Paul Winkler # http://code.activestate.com/recipes/81611-roman-numerals/ # PSFL (GPL compatible) +from __future__ import print_function, unicode_literals + import os @@ -18,11 +20,11 @@ def int_to_roman(input): Traceback (most recent call last): ValueError: Argument must be between 1 and 3999 - >>> int_to_roman(1.5) + >>> int_to_roman(1.5) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): TypeError: expected integer, got - >>> for i in range(1, 21): print int_to_roman(i) + >>> for i in range(1, 21): print(int_to_roman(i)) ... I II @@ -44,15 +46,15 @@ def int_to_roman(input): XVIII XIX XX - >>> print int_to_roman(2000) + >>> print(int_to_roman(2000)) MM - >>> print int_to_roman(1999) + >>> print(int_to_roman(1999)) MCMXCIX """ if type(input) != type(1): - raise TypeError, "expected integer, got %s" % type(input) + raise TypeError("expected integer, got %s" % type(input)) if not 0 < input < 4000: - raise ValueError, "Argument must be between 1 and 3999" + raise ValueError("Argument must be between 1 and 3999") ints = (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1) nums = ('M', 'CM', 'D', 'CD','C', 'XC','L','XL','X','IX','V','IV','I') result = "" @@ -66,17 +68,17 @@ def roman_to_int(input): """ Convert a roman numeral to an integer. - >>> r = range(1, 4000) + >>> r = list(range(1, 4000)) >>> nums = [int_to_roman(i) for i in r] >>> ints = [roman_to_int(n) for n in nums] - >>> print r == ints + >>> print(r == ints) 1 >>> roman_to_int('VVVIV') Traceback (most recent call last): ... ValueError: input is not a valid roman numeral: VVVIV - >>> roman_to_int(1) + >>> roman_to_int(1) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... 
TypeError: expected string, got @@ -90,14 +92,14 @@ def roman_to_int(input): ValueError: input is not a valid roman numeral: IL """ if type(input) != type(""): - raise TypeError, "expected string, got %s" % type(input) + raise TypeError("expected string, got %s" % type(input)) input = input.upper() nums = ['M', 'D', 'C', 'L', 'X', 'V', 'I'] ints = [1000, 500, 100, 50, 10, 5, 1] places = [] for c in input: if not c in nums: - raise ValueError, "input is not a valid roman numeral: %s" % input + raise ValueError("input is not a valid roman numeral: %s" % input) for i in range(len(input)): c = input[i] value = ints[nums.index(c)] @@ -116,9 +118,9 @@ def roman_to_int(input): if int_to_roman(sum) == input: return sum else: - raise ValueError, 'input is not a valid roman numeral: %s' % input + raise ValueError('input is not a valid roman numeral: %s' % input) def makedirs(path): if not os.path.isdir(path): - os.makedirs(path) \ No newline at end of file + os.makedirs(path) diff --git a/scripts/book2cover b/scripts/book2cover index 444563c..a81fc63 100755 --- a/scripts/book2cover +++ b/scripts/book2cover @@ -4,8 +4,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from StringIO import StringIO -from librarian import OutputFile +from __future__ import unicode_literals + from librarian.book2anything import Book2Anything, Option diff --git a/scripts/book2epub b/scripts/book2epub index 7a7a41d..5b906b9 100755 --- a/scripts/book2epub +++ b/scripts/book2epub @@ -4,6 +4,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from librarian.book2anything import Book2Anything, Option diff --git a/scripts/book2fb2 b/scripts/book2fb2 index 584ae99..de4615b 100755 --- a/scripts/book2fb2 +++ b/scripts/book2fb2 @@ -4,6 +4,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from librarian.book2anything import Book2Anything diff --git a/scripts/book2html b/scripts/book2html index 2c1d04e..f6d459d 100755 --- a/scripts/book2html +++ b/scripts/book2html @@ -4,6 +4,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from librarian.book2anything import Book2Anything, Option diff --git a/scripts/book2mobi b/scripts/book2mobi index b283309..b0d0686 100755 --- a/scripts/book2mobi +++ b/scripts/book2mobi @@ -4,6 +4,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from librarian.book2anything import Book2Anything, Option diff --git a/scripts/book2partner b/scripts/book2partner index f1892bb..8982354 100755 --- a/scripts/book2partner +++ b/scripts/book2partner @@ -4,20 +4,15 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# +from __future__ import print_function, unicode_literals + +from collections import OrderedDict import inspect import optparse import os import sys from librarian import packagers -try: - from collections import OrderedDict -except ImportError: - try: - from django.utils.datastructures import SortedDict - OrderedDict = SortedDict - except ImportError: - OrderedDict = dict if __name__ == '__main__': @@ -64,12 +59,12 @@ if __name__ == '__main__': if inspect.isclass(package) and issubclass(package, packagers.Packager): packages[package_name] = package if not packages: - print 'No packages found!' + print('No packages found!') if options.list_packages: - print 'Available packages:' + print('Available packages:') for package_name, package in packages.items(): - print ' ', package_name + print(' ', package_name) exit(0) if len(input_filenames) < 1 or not options.packages: @@ -79,6 +74,6 @@ if __name__ == '__main__': used_packages = [packages[p] for p in options.packages.split(',')] for package in used_packages: if options.verbose: - print 'Package:', package.__name__ + print('Package:', package.__name__) package.prepare(input_filenames, options.output_dir, options.verbose, options.overwrite) diff --git a/scripts/book2pdf b/scripts/book2pdf index ccb5fac..3c363f1 100755 --- a/scripts/book2pdf +++ b/scripts/book2pdf @@ -4,6 +4,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from librarian.book2anything import Book2Anything, Option diff --git a/scripts/book2txt b/scripts/book2txt index c706a07..0e84ac9 100755 --- a/scripts/book2txt +++ b/scripts/book2txt @@ -4,6 +4,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from librarian.book2anything import Book2Anything, Option from librarian.parser import WLDocument diff --git a/scripts/bookfragments b/scripts/bookfragments index 0d94497..b283297 100755 --- a/scripts/bookfragments +++ b/scripts/bookfragments @@ -4,6 +4,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import print_function, unicode_literals + import os import optparse @@ -29,14 +31,14 @@ if __name__ == '__main__': # Do some real work for input_filename in input_filenames: if options.verbose: - print input_filename + print(input_filename) output_filename = os.path.splitext(input_filename)[0] + '.fragments.html' closed_fragments, open_fragments = html.extract_fragments(input_filename) for fragment_id in open_fragments: - print '%s:warning:unclosed fragment #%s' % (input_filename, fragment_id) + print('%s:warning:unclosed fragment #%s' % (input_filename, fragment_id)) output_file = open(output_filename, 'w') output_file.write(""" diff --git a/scripts/fn_qualifiers_list_from_redmine.py b/scripts/fn_qualifiers_list_from_redmine.py old mode 100644 new mode 100755 index 020b119..66b00cc --- a/scripts/fn_qualifiers_list_from_redmine.py +++ b/scripts/fn_qualifiers_list_from_redmine.py @@ -5,16 +5,17 @@ This scripts reads the table of footnote qualifiers from Redmine and produces contents of fn_qualifiers.py – a list of valid qualifiers. 
""" +from __future__ import print_function, unicode_literals from lxml import etree -from urllib2 import urlopen +from six.moves.urllib.request import urlopen url = 'http://redmine.nowoczesnapolska.org.pl/projects/wl-publikacje/wiki/Lista_skr%C3%B3t%C3%B3w' parser = etree.HTMLParser() tree = etree.parse(urlopen(url), parser) -print """\ +print("""\ # -*- coding: utf-8 \""" List of standard footnote qualifiers. @@ -24,12 +25,12 @@ do not edit it. from __future__ import unicode_literals -FN_QUALIFIERS = {""".encode('utf-8') +FN_QUALIFIERS = {""") for td in tree.findall('//td'): - print (" '%s': '%s'," % ( + print((" '%s': '%s'," % ( td[0].text.replace('\\', '\\\\').replace("'", "\\'"), td[0].tail.strip(' -').replace('\\', '\\\\').replace("'", "\\'") - )).encode('utf-8') + ))) -print """ }""".encode('utf-8') +print(""" }""") diff --git a/scripts/genslugs b/scripts/genslugs index a234096..9745b68 100755 --- a/scripts/genslugs +++ b/scripts/genslugs @@ -4,6 +4,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import print_function, unicode_literals + import os import optparse @@ -36,13 +38,13 @@ if __name__ == '__main__': # Do some real work for input_filename in input_filenames: if options.verbose: - print input_filename + print(input_filename) doc = etree.parse(input_filename) try: title = doc.find('//{http://purl.org/dc/elements/1.1/}title').text except AttributeError: - print '%s:error:Book title not found. Skipping.' % input_filename + print('%s:error:Book title not found. Skipping.' % input_filename) continue parent = '' @@ -52,14 +54,14 @@ if __name__ == '__main__': except AttributeError: pass except IndexError: - print '%s:error:Invalid parent URL "%s". Skipping.' % (input_filename, parent_url) + print('%s:error:Invalid parent URL "%s". Skipping.' % (input_filename, parent_url)) book_url = doc.find('//{http://purl.org/dc/elements/1.1/}identifier.url') if book_url is None: book_description = doc.find('//{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description') book_url = etree.SubElement(book_description, '{http://purl.org/dc/elements/1.1/}identifier.url') if not options.force and book_url.text.startswith('http://'): - print '%s:Notice:Book already has identifier URL "%s". Skipping.' % (input_filename, book_url.text) + print('%s:Notice:Book already has identifier URL "%s". Skipping.' 
% (input_filename, book_url.text)) continue book_url.text = BOOK_URL + slughifi(parent + title)[:60] diff --git a/setup.py b/setup.py index 10abe6e..b391f0c 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ # import os import os.path -from distutils.core import setup +from setuptools import setup def whole_tree(prefix, path): files = [] @@ -21,7 +21,7 @@ def whole_tree(prefix, path): setup( name='librarian', - version='1.6', + version='1.7', description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats', author="Marek Stępniowski", author_email='marek@stepniowski.com', @@ -29,13 +29,15 @@ setup( maintainer_email='radoslaw.czajka@nowoczesnapolska.org.pl', url='http://github.com/fnp/librarian', packages=['librarian', 'librarian.embeds'], - package_data={'librarian': ['xslt/*.xslt', 'epub/*', 'mobi/*', 'pdf/*', 'fb2/*', 'fonts/*'] + + package_data={'librarian': ['xslt/*.xslt', 'xslt/*.xml', 'epub/*', 'pdf/*', 'fb2/*', 'fonts/*'] + whole_tree(os.path.join(os.path.dirname(__file__), 'librarian'), 'res') + whole_tree(os.path.join(os.path.dirname(__file__), 'librarian'), 'font-optimizer')}, include_package_data=True, install_requires=[ - 'lxml>=2.2', + 'lxml>=2.2,<=4.3', 'Pillow', + 'six', + 'texml', ], scripts=['scripts/book2html', 'scripts/book2txt', @@ -47,5 +49,4 @@ setup( 'scripts/book2cover', 'scripts/bookfragments', 'scripts/genslugs'], - tests_require=['nose>=0.11', 'coverage>=3.0.1'], ) diff --git a/tests/files/dcparser/andersen_brzydkie_kaczatko.out b/tests/files/dcparser/andersen_brzydkie_kaczatko.out index c0fb00b..9f07b39 100644 --- a/tests/files/dcparser/andersen_brzydkie_kaczatko.out +++ b/tests/files/dcparser/andersen_brzydkie_kaczatko.out @@ -1,5 +1,5 @@ { - 'publisher': u'Fundacja Nowoczesna Polska', + 'publisher': [u'Fundacja Nowoczesna Polska'], 'about': u'http://wiki.wolnepodreczniki.pl/Lektury:Andersen/Brzydkie_kaczątko', 'source_name': u'Andersen, Hans Christian (1805-1875), Baśnie, Gebethner i Wolff, wyd. 7, Kraków, 1925', 'author': u'Andersen, Hans Christian', diff --git a/tests/files/dcparser/biedrzycki_akslop.out b/tests/files/dcparser/biedrzycki_akslop.out index a7eeffe..588a4b7 100644 --- a/tests/files/dcparser/biedrzycki_akslop.out +++ b/tests/files/dcparser/biedrzycki_akslop.out @@ -1,6 +1,6 @@ { 'editors': [u'Sekuła, Aleksandra'], - 'publisher': u'Fundacja Nowoczesna Polska', + 'publisher': [u'Fundacja Nowoczesna Polska'], 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Biedrzycki/Akslop', 'source_name': u'Miłosz Biedrzycki, * ("Gwiazdka"), Fundacja "brulion", Kraków-Warszawa, 1993', 'author': u'Biedrzycki, Miłosz', diff --git a/tests/files/dcparser/kochanowski_piesn7.out b/tests/files/dcparser/kochanowski_piesn7.out index b3eba1e..96198a3 100644 --- a/tests/files/dcparser/kochanowski_piesn7.out +++ b/tests/files/dcparser/kochanowski_piesn7.out @@ -1,5 +1,5 @@ { - 'publisher': u'Fundacja Nowoczesna Polska', + 'publisher': [u'Fundacja Nowoczesna Polska'], 'about': u'http://wiki.wolnepodreczniki.pl/Lektury:Kochanowski/Pieśni/Pieśń_VII_(1)', 'source_name': u'Kochanowski, Jan (1530-1584), Dzieła polskie, tom 1, oprac. Julian Krzyżanowski, wyd. 
8, Państwowy Instytut Wydawniczy, Warszawa, 1976', 'author': u'Kochanowski, Jan', diff --git a/tests/files/dcparser/mickiewicz_rybka.out b/tests/files/dcparser/mickiewicz_rybka.out index a35f935..f3c76c0 100644 --- a/tests/files/dcparser/mickiewicz_rybka.out +++ b/tests/files/dcparser/mickiewicz_rybka.out @@ -1,6 +1,6 @@ { 'editors': [u'Sekuła, Aleksandra', u'Kallenbach, Józef'], - 'publisher': u'Fundacja Nowoczesna Polska', + 'publisher': [u'Fundacja Nowoczesna Polska'], 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Mickiewicz/Ballady/Rybka', 'source_name': u'Mickiewicz, Adam (1798-1855), Poezje, tom 1 (Wiersze młodzieńcze - Ballady i romanse - Wiersze do r. 1824), Krakowska Spółdzielnia Wydawnicza, wyd. 2 zwiększone, Kraków, 1922', 'author': u'Mickiewicz, Adam', diff --git a/tests/files/dcparser/sofokles_antygona.out b/tests/files/dcparser/sofokles_antygona.out index d934602..477988f 100644 --- a/tests/files/dcparser/sofokles_antygona.out +++ b/tests/files/dcparser/sofokles_antygona.out @@ -1,6 +1,6 @@ { 'editors': [u'Sekuła, Aleksandra'], - 'publisher': u'Fundacja Nowoczesna Polska', + 'publisher': [u'Fundacja Nowoczesna Polska'], 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Sofokles/Antygona', 'source_name': u'Sofokles (496-406 a.C.), Antygona, Zakład Narodowy im. Ossolińskich, wyd. 7, Lwów, 1939', 'author': u'Sofokles', diff --git a/tests/files/text/asnyk_miedzy_nami_expected.fb2 b/tests/files/text/asnyk_miedzy_nami_expected.fb2 new file mode 100644 index 0000000..b9e4e13 --- /dev/null +++ b/tests/files/text/asnyk_miedzy_nami_expected.fb2 @@ -0,0 +1,46 @@ + + + + + <p>Adam Asnyk</p> + <p>Między nami nic nie było</p> + + +
+ Utwór opracowany został w ramach projektu
+ Wolne Lektury
+ przez fundację
+ Nowoczesna Polska.
+
+
+
+
+
+ Między nami nic nie było!
+ Żadnych zwierzeń, wyznań żadnych!
+ Nic nas z sobą nie łączyło —
+ Prócz wiosennych marzeń zdradnych;
+
+
+ Prócz tych woni, barw i blasków,
+ Unoszących się w przestrzeni;
+ Prócz szumiących śpiewem lasków
+ I tej świeżej łąk zieleni;
+
+
+ Prócz tych kaskad i potoków,
+ Zraszających każdy parów,
+ Prócz girlandy tęcz, obłoków,
+ Prócz natury słodkich czarów;
+
+
+ Prócz tych wspólnych, jasnych zdrojów,
+ Z których serce zachwyt piło;
+ Prócz pierwiosnków i powojów,—
+ Między nami nic nie było!
+
+
+
+
+
diff --git a/tests/files/text/asnyk_miedzy_nami_expected.txt b/tests/files/text/asnyk_miedzy_nami_expected.txt index 3942928..92cc1bd 100644 --- a/tests/files/text/asnyk_miedzy_nami_expected.txt +++ b/tests/files/text/asnyk_miedzy_nami_expected.txt @@ -37,6 +37,8 @@ Ten utwór nie jest objęty majątkowym prawem autorskim i znajduje się w domen Tekst opracowany na podstawie: (Asnyk, Adam) El...y (1838-1897), Poezye, t. 3, Gebethner i Wolff, wyd. nowe poprzedzone słowem wstępnym St. Krzemińskiego, Warszawa, 1898 +Wydawca: Fundacja Nowoczesna Polska + Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. Opracowanie redakcyjne i przypisy: Adam Fikcyjny, Aleksandra Sekuła, Olga Sutkowska. diff --git a/tests/files/text/asnyk_miedzy_nami_expected_raw.txt b/tests/files/text/asnyk_miedzy_nami_expected_raw.txt new file mode 100644 index 0000000..cac61d8 --- /dev/null +++ b/tests/files/text/asnyk_miedzy_nami_expected_raw.txt @@ -0,0 +1,22 @@ + + +Między nami nic nie było! +Żadnych zwierzeń, wyznań żadnych! +Nic nas z sobą nie łączyło — +Prócz wiosennych marzeń zdradnych; + +Prócz tych woni, barw i blasków, +Unoszących się w przestrzeni; +Prócz szumiących śpiewem lasków +I tej świeżej łąk zieleni; + +Prócz tych kaskad i potoków, +Zraszających każdy parów, +Prócz girlandy tęcz, obłoków, +Prócz natury słodkich czarów; + +Prócz tych wspólnych, jasnych zdrojów, +Z których serce zachwyt piło; +Prócz pierwiosnków i powojów,— +Między nami nic nie było! + diff --git a/tests/test_dcparser.py b/tests/test_dcparser.py index cab5b1c..4dab764 100644 --- a/tests/test_dcparser.py +++ b/tests/test_dcparser.py @@ -3,6 +3,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from librarian import dcparser from lxml import etree from nose.tools import * @@ -13,9 +15,9 @@ from datetime import date def check_dcparser(xml_file, result_file): - xml = file(xml_file).read() + xml = open(xml_file, 'rb').read() result = codecs.open(result_file, encoding='utf-8').read() - info = dcparser.BookInfo.from_string(xml).to_dict() + info = dcparser.BookInfo.from_bytes(xml).to_dict() should_be = eval(result) for key in should_be: assert_equals(info[key], should_be[key]) @@ -28,13 +30,13 @@ def test_dcparser(): def check_serialize(xml_file): - xml = file(xml_file).read() - info = dcparser.BookInfo.from_string(xml) + xml = open(xml_file, 'rb').read() + info = dcparser.BookInfo.from_bytes(xml) # serialize - serialized = etree.tostring(info.to_etree(), encoding=unicode).encode('utf-8') + serialized = etree.tostring(info.to_etree(), encoding='unicode').encode('utf-8') # then parse again - info_bis = dcparser.BookInfo.from_string(serialized) + info_bis = dcparser.BookInfo.from_bytes(serialized) # check if they are the same for key in vars(info): @@ -49,7 +51,7 @@ def test_serialize(): def test_asdate(): - assert_equals(dcparser.as_date(u"2010-10-03"), date(2010, 10, 03)) + assert_equals(dcparser.as_date(u"2010-10-03"), date(2010, 10, 3)) assert_equals(dcparser.as_date(u"2011"), date(2011, 1, 1)) assert_equals(dcparser.as_date(u"2 poł. XIX w."), date(1950, 1, 1)) assert_equals(dcparser.as_date(u"XVII w., l. 
20"), date(1720, 1, 1)) diff --git a/tests/test_epub.py b/tests/test_epub.py index 720fec6..4ac874a 100644 --- a/tests/test_epub.py +++ b/tests/test_epub.py @@ -3,6 +3,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from zipfile import ZipFile from lxml import html from nose.tools import * @@ -30,3 +32,13 @@ def test_transform(): u'Opracowanie redakcyjne i przypisy: ' u'Adam Fikcyjny, Aleksandra Sekuła, Olga Sutkowska.') assert_true(editors_attribution) + + +def test_transform_hyphenate(): + epub = WLDocument.from_file( + get_fixture('text', 'asnyk_zbior.xml'), + provider=DirDocProvider(get_fixture('text', '')) + ).as_epub( + flags=['without_fonts'], + hyphenate=True + ).get_file() diff --git a/tests/test_fb2.py b/tests/test_fb2.py new file mode 100644 index 0000000..2b8de67 --- /dev/null +++ b/tests/test_fb2.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +from __future__ import unicode_literals + +from librarian import NoDublinCore +from librarian.parser import WLDocument +from nose.tools import * +from .utils import get_fixture + + +def test_transform(): + expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.fb2') + + text = WLDocument.from_file( + get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml') + ).as_fb2().get_bytes() + + assert_equal(text, open(expected_output_file_path, 'rb').read()) + diff --git a/tests/test_html.py b/tests/test_html.py index a0de630..d77d8fe 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -3,10 +3,12 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from librarian import NoDublinCore from librarian.parser import WLDocument from nose.tools import * -from utils import get_fixture +from .utils import get_fixture def test_transform(): @@ -14,9 +16,9 @@ def test_transform(): html = WLDocument.from_file( get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml') - ).as_html().get_string() + ).as_html().get_bytes() - assert_equal(html, file(expected_output_file_path).read()) + assert_equal(html, open(expected_output_file_path, 'rb').read()) @raises(NoDublinCore) @@ -35,7 +37,7 @@ def test_passing_parse_dublincore_to_transform(): def test_empty(): - assert not WLDocument.from_string( - '', + assert not WLDocument.from_bytes( + b'', parse_dublincore=False, ).as_html() diff --git a/tests/test_html_annotations.py b/tests/test_html_annotations.py index 234f297..410577c 100644 --- a/tests/test_html_annotations.py +++ b/tests/test_html_annotations.py @@ -21,73 +21,73 @@ def test_annotations(): ('', ( 'pe', - [], - '', - '

' + [], + '[przypis edytorski]', + '

[przypis edytorski]

' ), 'Empty footnote'), ('Definiendum --- definiens.', ( 'pr', - [], - 'Definiendum \u2014 definiens.', - '

Definiendum \u2014 definiens.

' + [], + 'Definiendum \u2014 definiens. [przypis redakcyjny]', + '

Definiendum \u2014 definiens. [przypis redakcyjny]

' ), 'Plain footnote.'), ('Definiendum --- definiens.', ( 'pt', - [], - 'Definiendum \u2014 definiens.', - '

Definiendum \u2014 definiens.

' + [], + 'Definiendum \u2014 definiens. [przypis tłumacza]', + '

Definiendum \u2014 definiens. [przypis tłumacza]

' ), 'Standard footnote.'), ('Definiendum (łac.) --- definiens.', ( 'pr', - ['łac.'], - 'Definiendum (łac.) \u2014 definiens.', - '

Definiendum (łac.) \u2014 definiens.

' + ['łac.'], + 'Definiendum (łac.) \u2014 definiens. [przypis redakcyjny]', + '

Definiendum (łac.) \u2014 definiens. [przypis redakcyjny]

' ), 'Plain footnote with qualifier'), ('Definiendum (łac.) --- definiens.', ( 'pe', - ['łac.'], - 'Definiendum (łac.) \u2014 definiens.', - '

Definiendum (łac.) \u2014 definiens.

' + ['łac.'], + 'Definiendum (łac.) \u2014 definiens. [przypis edytorski]', + '

Definiendum (łac.) \u2014 definiens. [przypis edytorski]

' ), 'Standard footnote with qualifier.'), (' Definiendum (daw.) --- definiens.', ( 'pt', - ['daw.'], - 'Definiendum (daw.) \u2014 definiens.', - '

Definiendum (daw.) \u2014 definiens.

' + ['daw.'], + 'Definiendum (daw.) \u2014 definiens. [przypis tłumacza]', + '

Definiendum (daw.) \u2014 definiens. [przypis tłumacza]

' ), 'Standard footnote with leading whitespace and qualifier.'), ('Definiendum (łac.) --- definiens.', ( 'pr', - ['łac.'], - 'Definiendum (łac.) \u2014 definiens.', - '

Definiendum (łac.) \u2014 definiens.

' + ['łac.'], + 'Definiendum (łac.) \u2014 definiens. [przypis redakcyjny]', + '

Definiendum (łac.) \u2014 definiens. [przypis redakcyjny]

' ), 'Plain footnote with qualifier and some emphasis.'), ('Definiendum (łac.) --- definiens.', ( 'pe', ['łac.'], - 'Definiendum (łac.) \u2014 definiens.', - '

Definiendum (łac.) \u2014 definiens.

' + 'Definiendum (łac.) \u2014 definiens. [przypis edytorski]', + '

Definiendum (łac.) \u2014 definiens. [przypis edytorski]

' ), 'Standard footnote with qualifier and some emphasis.'), ('Definiendum (łac.) --- definiens (some) --- more text.', ( 'pe', ['łac.'], - 'Definiendum (łac.) \u2014 definiens (some) \u2014 more text.', - '

Definiendum (łac.) \u2014 definiens (some) \u2014 more text.

', + 'Definiendum (łac.) \u2014 definiens (some) \u2014 more text. [przypis edytorski]', + '

Definiendum (łac.) \u2014 definiens (some) \u2014 more text. [przypis edytorski]

', ), 'Footnote with a second parentheses and mdash.'), @@ -96,9 +96,9 @@ def test_annotations(): 'pe', ['daw.', 'niem.'], 'gemajna (daw., z niem. gemein: zwykły) \u2014 częściej: gemajn, ' - 'szeregowiec w wojsku polskim cudzoziemskiego autoramentu.', + 'szeregowiec w wojsku polskim cudzoziemskiego autoramentu. [przypis edytorski]', '

gemajna (daw., z niem. gemein: zwykły) ' - '\u2014 częściej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu.

' + '\u2014 częściej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu. [przypis edytorski]

' ), 'Footnote with multiple and qualifiers and emphasis.'), @@ -106,7 +106,9 @@ def test_annotations(): xml_src = ''' %s ''' % "".join( t[0] for t in annotations) - html = WLDocument.from_string(xml_src, parse_dublincore=False).as_html().get_file() + html = WLDocument.from_bytes( + xml_src.encode('utf-8'), + parse_dublincore=False).as_html().get_file() res_annotations = list(extract_annotations(html)) for i, (src, expected, name) in enumerate(annotations): diff --git a/tests/test_html_fragments.py b/tests/test_html_fragments.py index 3e87a9e..16057bc 100644 --- a/tests/test_html_fragments.py +++ b/tests/test_html_fragments.py @@ -3,9 +3,11 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from librarian.html import extract_fragments from nose.tools import * -from utils import get_fixture +from .utils import get_fixture def test_fragments(): @@ -14,5 +16,5 @@ def test_fragments(): closed_fragments, open_fragments = extract_fragments( get_fixture('text', 'asnyk_miedzy_nami_expected.html')) assert not open_fragments - fragments_text = u"\n\n".join(u"%s: %s\n%s" % (f.id, f.themes, f) for f in closed_fragments.values()) - assert_equal(fragments_text, file(expected_output_file_path).read().decode('utf-8')) + fragments_text = u"\n\n".join(u"%s: %s\n%s" % (f.id, f.themes, f) for f in sorted(closed_fragments.values(), key=lambda f: f.id)) + assert_equal(fragments_text, open(expected_output_file_path, 'rb').read().decode('utf-8')) diff --git a/tests/test_mobi.py b/tests/test_mobi.py new file mode 100644 index 0000000..3b29e72 --- /dev/null +++ b/tests/test_mobi.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +from __future__ import unicode_literals + +from zipfile import ZipFile +from lxml import html +from nose.tools import * +from librarian import DirDocProvider +from librarian.parser import WLDocument +from tests.utils import get_fixture + + +def test_transform(): + mobi = WLDocument.from_file( + get_fixture('text', 'asnyk_zbior.xml'), + provider=DirDocProvider(get_fixture('text', '')) + ).as_mobi(converter_path='true').get_file() diff --git a/tests/test_pdf.py b/tests/test_pdf.py index 5b2dba1..98d1fa6 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -3,11 +3,14 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + +import re from tempfile import NamedTemporaryFile from nose.tools import * from librarian import DirDocProvider from librarian.parser import WLDocument -from utils import get_fixture +from .utils import get_fixture def test_transform(): @@ -17,9 +20,8 @@ def test_transform(): get_fixture('text', 'asnyk_zbior.xml'), provider=DirDocProvider(get_fixture('text', '')) ).as_pdf(save_tex=temp.name) - tex = open(temp.name).read().decode('utf-8') - print tex + tex = open(temp.name, 'rb').read().decode('utf-8') # Check contributor list. 
- editors = re.search(ur'\\def\\editors\{Opracowanie redakcyjne i przypisy: ([^}]*?)\.\s*\}', tex) + editors = re.search(r'\\def\\editors\{Opracowanie redakcyjne i przypisy: ([^}]*?)\.\s*\}', tex) assert_equal(editors.group(1), u"Adam Fikcyjny, Aleksandra Sekuła, Olga Sutkowska") diff --git a/tests/test_picture.py b/tests/test_picture.py index 00b03ce..f97609b 100644 --- a/tests/test_picture.py +++ b/tests/test_picture.py @@ -3,6 +3,8 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from librarian import picture, dcparser from tests.utils import get_all_fixtures, get_fixture from os import path @@ -46,7 +48,6 @@ def test_picture_parts(): motifs = set() names = set() - print parts for p in parts: for m in p['themes']: motifs.add(m) diff --git a/tests/test_text.py b/tests/test_text.py index 70dfb60..14c728f 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -3,10 +3,12 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # +from __future__ import unicode_literals + from librarian import NoDublinCore from librarian.parser import WLDocument from nose.tools import * -from utils import get_fixture +from .utils import get_fixture def test_transform(): @@ -14,9 +16,19 @@ def test_transform(): text = WLDocument.from_file( get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml') - ).as_text().get_string() + ).as_text().get_bytes() + + assert_equal(text, open(expected_output_file_path, 'rb').read()) + + +def test_transform_raw(): + expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected_raw.txt') + + text = WLDocument.from_file( + get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml') + ).as_text(flags=['raw-text']).get_bytes() - assert_equal(text, file(expected_output_file_path).read()) + assert_equal(text, open(expected_output_file_path, 'rb').read()) @raises(NoDublinCore) diff --git a/tests/utils.py b/tests/utils.py index fc87532..7da206c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -3,7 +3,6 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from __future__ import with_statement from os.path import realpath, join, dirname import glob diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..5b28a3b --- /dev/null +++ b/tox.ini @@ -0,0 +1,28 @@ +[tox] +envlist = + clean, + py{27,34,35,36,37}, + stats + +[testenv] +deps = + nose + coverage +passenv = HOME ; Needed to find locally installed fonts when testing PDF production. +commands = + nosetests --with-coverage --cover-package=librarian -d --with-doctest --with-xunit --exe +install_command = pip install --extra-index-url https://py.mdrn.pl/simple {packages} + +[testenv:clean] +basepython = python2 +commands = + coverage erase +deps = coverage + +[testenv:stats] +basepython = python2 +commands = + coverage report + coverage html +deps = coverage + -- 2.20.1
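
Review note, not part of the patch: a minimal sketch of the renamed byte-oriented API and the new converter_path argument, based only on the calls exercised in the updated tests above; the input and output paths are hypothetical placeholders.

    # Sketch under the assumptions stated above; mirrors the calls in
    # tests/test_text.py, tests/test_fb2.py and tests/test_mobi.py.
    from librarian import DirDocProvider
    from librarian.parser import WLDocument

    # 'book.xml' is a placeholder document; the provider resolves included parts.
    doc = WLDocument.from_file('book.xml', provider=DirDocProvider('.'))

    # get_bytes() replaces the old get_string() on OutputFile.
    with open('book.txt', 'wb') as out:
        out.write(doc.as_text(flags=['raw-text']).get_bytes())

    with open('book.fb2', 'wb') as out:
        out.write(doc.as_fb2().get_bytes())

    # converter_path is the argument added in this patch; tests/test_mobi.py
    # passes 'true' as a stand-in for a real converter.
    mobi_file = doc.as_mobi(converter_path='true').get_file()

With the Tox configuration above, running `tox` exercises the test suite on every configured interpreter; `tox -e py27` runs a single environment.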