From 3b0b98465bc1862306b05bb8305a1abbf40ca310 Mon Sep 17 00:00:00 2001 From: Jan Szejko Date: Fri, 1 Jul 2016 14:26:00 +0200 Subject: [PATCH] style --- librarian/__init__.py | 76 ++++++---- librarian/book2anything.py | 117 +++++++-------- librarian/cover.py | 14 +- librarian/dcparser.py | 159 ++++++++++---------- librarian/epub.py | 40 ++--- librarian/fb2.py | 9 +- librarian/functions.py | 12 +- librarian/html.py | 39 +++-- librarian/mobi.py | 16 +- librarian/packagers.py | 34 +++-- librarian/parser.py | 44 +++--- librarian/pdf.py | 36 +++-- librarian/picture.py | 39 ++--- librarian/pyhtml.py | 2 +- librarian/pypdf.py | 195 +++++++++++++------------ librarian/styles/wolnelektury/cover.py | 15 +- librarian/styles/wolnelektury/pdf.py | 13 -- librarian/text.py | 21 ++- librarian/xmlutils.py | 41 +++--- setup.py | 12 +- tests/test_dcparser.py | 1 - tests/test_epub.py | 3 +- tests/test_html.py | 1 + tests/test_iofile.py | 8 +- tests/test_pdf.py | 8 +- tests/test_picture.py | 16 +- tests/test_pyhtml.py | 31 ++-- tests/utils.py | 2 +- 28 files changed, 520 insertions(+), 484 deletions(-) delete mode 100644 librarian/styles/wolnelektury/pdf.py diff --git a/librarian/__init__.py b/librarian/__init__.py index 23244ef..5c145d3 100644 --- a/librarian/__init__.py +++ b/librarian/__init__.py @@ -9,6 +9,7 @@ import os import re import shutil import urllib +import lxml.etree as etree class UnicodeException(Exception): @@ -25,22 +26,27 @@ class UnicodeException(Exception): message = unicode(args, encoding='utf-8', errors='ignore') return message + class ParseError(UnicodeException): pass + class ValidationError(UnicodeException): pass + class NoDublinCore(ValidationError): """There's no DublinCore section, and it's required.""" pass + class NoProvider(UnicodeException): """There's no DocProvider specified, and it's needed.""" pass + class XMLNamespace(object): - '''A handy structure to repsent names in an XML namespace.''' + """A handy structure to repsent names in an XML namespace.""" def __init__(self, uri): self.uri = uri @@ -57,6 +63,7 @@ class XMLNamespace(object): def __str__(self): return '%s' % self.uri + class EmptyNamespace(XMLNamespace): def __init__(self): super(EmptyNamespace, self).__init__('') @@ -80,8 +87,9 @@ class WLURI(object): slug = None example = 'http://edukacjamedialna.edu.pl/lekcje/template' - _re_wl_uri = re.compile(r'http://(www\.)?edukacjamedialna.edu.pl/lekcje/' - '(?P[-a-z0-9]+)/?$') + _re_wl_uri = re.compile( + r'http://(www\.)?edukacjamedialna.edu.pl/lekcje/' + '(?P[-a-z0-9]+)/?$') def __init__(self, uri): uri = unicode(uri) @@ -148,43 +156,46 @@ class DirDocProvider(DocProvider): return IOFile.from_filename(os.path.join(self.dir, fname)) -import lxml.etree as etree -import dcparser - -DEFAULT_BOOKINFO = dcparser.BookInfo( - { RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'}, - { - DCNS('creator.expert'): [u'Some, Author'], - DCNS('creator.scenario'): [u'Some, Author'], - DCNS('creator.textbook'): [u'Some, Author'], - DCNS('title'): [u'Some Title'], - DCNS('subject.period'): [u'Unknown'], - DCNS('subject.type'): [u'Unknown'], - DCNS('subject.genre'): [u'Unknown'], - DCNS('date'): ['1970-01-01'], - DCNS('language'): [u'pol'], - # DCNS('date'): [creation_date], - DCNS('publisher'): [u"Fundacja Nowoczesna Polska"], - DCNS('description'): - [u"""Publikacja zrealizowana w ramach projektu - Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa - wykonana przez Bibliotekę Narodową z egzemplarza - pochodzącego ze zbiorów BN."""], - DCNS('identifier.url'): [WLURI.example], - DCNS('rights'): - [u"Domena publiczna - zm. [OPIS STANU PRAWNEGO TEKSTU]"] }) +def get_default_bookinfo(): + import dcparser + dcparser.BookInfo( + {RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'}, + { + DCNS('creator.expert'): [u'Some, Author'], + DCNS('creator.scenario'): [u'Some, Author'], + DCNS('creator.textbook'): [u'Some, Author'], + DCNS('title'): [u'Some Title'], + DCNS('subject.period'): [u'Unknown'], + DCNS('subject.type'): [u'Unknown'], + DCNS('subject.genre'): [u'Unknown'], + DCNS('date'): ['1970-01-01'], + DCNS('language'): [u'pol'], + # DCNS('date'): [creation_date], + DCNS('publisher'): [u"Fundacja Nowoczesna Polska"], + DCNS('description'): + [u"""Publikacja zrealizowana w ramach projektu + Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa + wykonana przez Bibliotekę Narodową z egzemplarza + pochodzącego ze zbiorów BN."""], + DCNS('identifier.url'): [WLURI.example], + DCNS('rights'): + [u"Domena publiczna - zm. [OPIS STANU PRAWNEGO TEKSTU]"], + }) + +DEFAULT_BOOKINFO = get_default_bookinfo() + def xinclude_forURI(uri): e = etree.Element(XINS("include")) e.set("href", uri) return etree.tostring(e, encoding=unicode) + def wrap_text(ocrtext, creation_date, bookinfo=DEFAULT_BOOKINFO): """Wrap the text within the minimal XML structure with a DC template.""" bookinfo.created_at = creation_date - dcstring = etree.tostring(bookinfo.to_etree(), \ - method='xml', encoding=unicode, pretty_print=True) + dcstring = etree.tostring(bookinfo.to_etree(), encoding=unicode, pretty_print=True) return u'\n' + dcstring + u'\n\n' + ocrtext + \ u'\n\n' @@ -194,8 +205,7 @@ def serialize_raw(element): b = u'' + (element.text or '') for child in element.iterchildren(): - e = etree.tostring(child, method='xml', encoding=unicode, - pretty_print=True) + e = etree.tostring(child, encoding=unicode, pretty_print=True) b += e return b @@ -204,9 +214,11 @@ SERIALIZERS = { 'raw': serialize_raw, } + def serialize_children(element, format='raw'): return SERIALIZERS[format](element) + def get_resource(path): return os.path.join(os.path.dirname(__file__), path) diff --git a/librarian/book2anything.py b/librarian/book2anything.py index b60cd0f..c8726c6 100644 --- a/librarian/book2anything.py +++ b/librarian/book2anything.py @@ -4,7 +4,6 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from collections import namedtuple import os.path import optparse @@ -30,19 +29,18 @@ class Option(object): class Book2Anything(object): """A class for creating book2... scripts. - + Subclass it for any format you want to convert to. """ - format_name = None # Set format name, like "PDF". - ext = None # Set file extension, like "pdf". - uses_cover = False # Can it add a cover? - cover_optional = True # Only relevant if uses_cover - uses_provider = False # Does it need a DocProvider? - transform = None # Transform method. Uses WLDocument.as_{ext} by default. - parser_options = [] # List of Option objects for additional parser args. - transform_options = [] # List of Option objects for additional transform args. - transform_flags = [] # List of Option objects for supported transform flags. - + format_name = None # Set format name, like "PDF". + ext = None # Set file extension, like "pdf". + uses_cover = False # Can it add a cover? + cover_optional = True # Only relevant if uses_cover + uses_provider = False # Does it need a DocProvider? + transform = None # Transform method. Uses WLDocument.as_{ext} by default. + parser_options = [] # List of Option objects for additional parser args. + transform_options = [] # List of Option objects for additional transform args. + transform_flags = [] # List of Option objects for supported transform flags. @classmethod def run(cls): @@ -52,27 +50,33 @@ class Book2Anything(object): parser = optparse.OptionParser(usage=usage) - parser.add_option('-v', '--verbose', - action='store_true', dest='verbose', default=False, - help='print status messages to stdout') - parser.add_option('-d', '--make-dir', - action='store_true', dest='make_dir', default=False, - help='create a directory for author and put the output file in it') - parser.add_option('-o', '--output-file', - dest='output_file', metavar='FILE', - help='specifies the output file') - parser.add_option('-O', '--output-dir', - dest='output_dir', metavar='DIR', - help='specifies the directory for output') + parser.add_option( + '-v', '--verbose', + action='store_true', dest='verbose', default=False, + help='print status messages to stdout') + parser.add_option( + '-d', '--make-dir', + action='store_true', dest='make_dir', default=False, + help='create a directory for author and put the output file in it') + parser.add_option( + '-o', '--output-file', + dest='output_file', metavar='FILE', + help='specifies the output file') + parser.add_option( + '-O', '--output-dir', + dest='output_dir', metavar='DIR', + help='specifies the directory for output') if cls.uses_cover: if cls.cover_optional: - parser.add_option('-c', '--with-cover', - action='store_true', dest='with_cover', default=False, - help='create default cover') - parser.add_option('-C', '--image-cache', - dest='image_cache', metavar='URL', - help='prefix for image download cache' + - (' (implies --with-cover)' if cls.cover_optional else '')) + parser.add_option( + '-c', '--with-cover', + action='store_true', dest='with_cover', default=False, + help='create default cover') + parser.add_option( + '-C', '--image-cache', + dest='image_cache', metavar='URL', + help='prefix for image download cache' + + (' (implies --with-cover)' if cls.cover_optional else '')) for option in cls.parser_options + cls.transform_options + cls.transform_flags: option.add(parser) @@ -80,7 +84,7 @@ class Book2Anything(object): if len(input_filenames) < 1: parser.print_help() - return(1) + return 1 # Prepare additional args for parser. parser_args = {} @@ -91,8 +95,7 @@ class Book2Anything(object): for option in cls.transform_options: transform_args[option.name()] = option.value(options) # Add flags to transform_args, if any. - transform_flags = [flag.name() for flag in cls.transform_flags - if flag.value(options)] + transform_flags = [flag.name() for flag in cls.transform_flags if flag.value(options)] if transform_flags: transform_args['flags'] = transform_flags # Add cover support, if any. @@ -105,35 +108,35 @@ class Book2Anything(object): elif not cls.cover_optional or options.with_cover: transform_args['cover'] = WLCover - # Do some real work + main_input = None try: for main_input in input_filenames: if options.verbose: print main_input - # Where to find input? - if cls.uses_provider: - path, fname = os.path.realpath(main_input).rsplit('/', 1) - provider = DirDocProvider(path) - else: - provider = None - - # Where to write output? - if not (options.output_file or options.output_dir): - output_file = os.path.splitext(main_input)[0] + '.' + cls.ext - else: - output_file = None - - # Do the transformation. - doc = WLDocument.from_file(main_input, provider=provider, **parser_args) - transform = cls.transform - if transform is None: - transform = getattr(WLDocument, 'as_%s' % cls.ext) - output = transform(doc, **transform_args) - - doc.save_output_file(output, - output_file, options.output_dir, options.make_dir, cls.ext) + # Where to find input? + if cls.uses_provider: + path, fname = os.path.realpath(main_input).rsplit('/', 1) + provider = DirDocProvider(path) + else: + provider = None + + # Where to write output? + if not (options.output_file or options.output_dir): + output_file = os.path.splitext(main_input)[0] + '.' + cls.ext + else: + output_file = None + + # Do the transformation. + doc = WLDocument.from_file(main_input, provider=provider, **parser_args) + transform = cls.transform + if transform is None: + transform = getattr(WLDocument, 'as_%s' % cls.ext) + output = transform(doc, **transform_args) + + doc.save_output_file( + output, output_file, options.output_dir, options.make_dir, cls.ext) except ParseError, e: print '%(file)s:%(name)s:%(message)s' % { diff --git a/librarian/cover.py b/librarian/cover.py index dc64a9c..2320f19 100644 --- a/librarian/cover.py +++ b/librarian/cover.py @@ -113,12 +113,12 @@ class Cover(object): exts = { 'JPEG': 'jpg', 'PNG': 'png', - } + } mime_types = { 'JPEG': 'image/jpeg', 'PNG': 'image/png', - } + } def __init__(self, book_info, format=None): try: @@ -154,24 +154,22 @@ class Cover(object): top = self.author_top tbox = TextBox( self.width - self.author_margin_left - self.author_margin_right, - self.height - top, - ) + self.height - top) author_font = self.author_font or ImageFont.truetype( get_resource('fonts/DejaVuSerif.ttf'), 30) tbox.text(self.pretty_author(), self.author_color, author_font, - self.author_lineskip, self.author_shadow) + self.author_lineskip, self.author_shadow) text_img = tbox.image() img.paste(text_img, (self.author_margin_left, top), text_img) top += text_img.size[1] + self.title_top tbox = TextBox( self.width - self.title_margin_left - self.title_margin_right, - self.height - top, - ) + self.height - top) title_font = self.author_font or ImageFont.truetype( get_resource('fonts/DejaVuSerif.ttf'), 40) tbox.text(self.pretty_title(), self.title_color, title_font, - self.title_lineskip, self.title_shadow) + self.title_lineskip, self.title_shadow) text_img = tbox.image() img.paste(text_img, (self.title_margin_left, top), text_img) diff --git a/librarian/dcparser.py b/librarian/dcparser.py index afcefa0..747ac86 100644 --- a/librarian/dcparser.py +++ b/librarian/dcparser.py @@ -10,7 +10,7 @@ import time from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS, WLURI) -import lxml.etree as etree # ElementTree API using libxml2 +import lxml.etree as etree # ElementTree API using libxml2 from lxml.etree import XMLSyntaxError @@ -25,7 +25,7 @@ class Person(object): @classmethod def from_text(cls, text): - parts = [ token.strip() for token in text.split(',') ] + parts = [token.strip() for token in text.split(',')] if len(parts) == 1: surname = parts[0] names = [] @@ -36,7 +36,7 @@ class Person(object): if len(parts[1]) == 0: # there is no non-whitespace data after the comma raise ValueError("Found a comma, but no names given: \"%s\" -> %r." % (text, parts)) - names = [ name for name in parts[1].split() if len(name) ] # all non-whitespace tokens + names = [name for name in parts[1].split() if len(name)] # all non-whitespace tokens return cls(surname, *names) def readable(self): @@ -60,6 +60,7 @@ class Person(object): def __repr__(self): return 'Person(last_name=%r, first_names=*%r)' % (self.last_name, self.first_names) + def as_date(text): try: try: @@ -70,18 +71,22 @@ def as_date(text): except ValueError, e: raise ValueError("Unrecognized date format. Try YYYY-MM-DD or YYYY.") + def as_person(text): return Person.from_text(text) + def as_unicode(text): if isinstance(text, unicode): return text else: return text.decode('utf-8') + def as_wluri_strict(text): return WLURI.strict(text) + class Field(object): def __init__(self, uri, attr_name, validator=as_unicode, strict=None, multiple=False, salias=None, **kwargs): self.uri = uri @@ -91,7 +96,7 @@ class Field(object): self.multiple = multiple self.salias = salias - self.required = kwargs.get('required', True) and not kwargs.has_key('default') + self.required = kwargs.get('required', True) and 'default' not in kwargs self.default = kwargs.get('default', [] if multiple else [None]) def validate_value(self, val, strict=False): @@ -104,7 +109,7 @@ class Field(object): if self.multiple: if validator is None: return val - return [ validator(v) if v is not None else v for v in val ] + return [validator(v) if v is not None else v for v in val] elif len(val) > 1: raise ValidationError("Multiple values not allowed for field '%s'" % self.uri) elif len(val) == 0: @@ -119,7 +124,7 @@ class Field(object): def validate(self, fdict, fallbacks=None, strict=False): if fallbacks is None: fallbacks = {} - if not fdict.has_key(self.uri): + if self.uri not in fdict: if not self.required: # Accept single value for single fields and saliases. if self.name in fallbacks: @@ -145,7 +150,7 @@ class Field(object): class DCInfo(type): - def __new__(meta, classname, bases, class_dict): + def __new__(mcs, classname, bases, class_dict): fields = list(class_dict['FIELDS']) for base in bases[::-1]: @@ -157,41 +162,41 @@ class DCInfo(type): fields.insert(0, field) class_dict['FIELDS'] = tuple(fields) - return super(DCInfo, meta).__new__(meta, classname, bases, class_dict) + return super(DCInfo, mcs).__new__(mcs, classname, bases, class_dict) class WorkInfo(object): __metaclass__ = DCInfo FIELDS = ( - Field( DCNS('creator.expert'), 'authors_expert', as_person, salias='author', required=False, multiple=True), - Field( DCNS('creator.methodologist'), 'authors_methodologist', as_person, salias='author', required=False, multiple=True), - Field( DCNS('creator.scenario'), 'authors_scenario', as_person, salias='author', required=False, multiple=True), - Field( DCNS('creator.textbook'), 'authors_textbook', as_person, salias='author', required=False, multiple=True), - Field( DCNS('requires'), 'requires', required=False, multiple=True), - Field( DCNS('title'), 'title'), - Field( DCNS('type'), 'type', required=False), - - Field( DCNS('contributor.editor'), 'editors', \ - as_person, salias='editor', multiple=True, default=[]), - Field( DCNS('contributor.technical_editor'), 'technical_editors', - as_person, salias='technical_editor', multiple=True, default=[]), - - Field( DCNS('date'), 'created_at', as_date), - Field( DCNS('date.pd'), 'released_to_public_domain_at', as_date, required=False), - Field( DCNS('publisher'), 'publisher'), - - Field( DCNS('subject.competence'), 'competences', multiple=True, required=False), - Field( DCNS('subject.curriculum'), 'curriculum', multiple=True, required=False), - - Field( DCNS('language'), 'language'), - Field( DCNS('description'), 'description', required=False), - - Field( DCNS('source'), 'source_name', required=False), - Field( DCNS('source.URL'), 'source_url', required=False), - Field( DCNS('identifier.url'), 'url', WLURI, strict=as_wluri_strict), - Field( DCNS('rights.license'), 'license', required=False), - Field( DCNS('rights'), 'license_description'), + Field(DCNS('creator.expert'), 'authors_expert', as_person, salias='author', required=False, multiple=True), + Field(DCNS('creator.methodologist'), 'authors_methodologist', as_person, salias='author', required=False, + multiple=True), + Field(DCNS('creator.scenario'), 'authors_scenario', as_person, salias='author', required=False, multiple=True), + Field(DCNS('creator.textbook'), 'authors_textbook', as_person, salias='author', required=False, multiple=True), + Field(DCNS('requires'), 'requires', required=False, multiple=True), + Field(DCNS('title'), 'title'), + Field(DCNS('type'), 'type', required=False), + + Field(DCNS('contributor.editor'), 'editors', as_person, salias='editor', multiple=True, default=[]), + Field(DCNS('contributor.technical_editor'), 'technical_editors', as_person, salias='technical_editor', + multiple=True, default=[]), + + Field(DCNS('date'), 'created_at', as_date), + Field(DCNS('date.pd'), 'released_to_public_domain_at', as_date, required=False), + Field(DCNS('publisher'), 'publisher'), + + Field(DCNS('subject.competence'), 'competences', multiple=True, required=False), + Field(DCNS('subject.curriculum'), 'curriculum', multiple=True, required=False), + + Field(DCNS('language'), 'language'), + Field(DCNS('description'), 'description', required=False), + + Field(DCNS('source'), 'source_name', required=False), + Field(DCNS('source.URL'), 'source_url', required=False), + Field(DCNS('identifier.url'), 'url', WLURI, strict=as_wluri_strict), + Field(DCNS('rights.license'), 'license', required=False), + Field(DCNS('rights'), 'license_description'), ) @classmethod @@ -203,8 +208,8 @@ class WorkInfo(object): def from_file(cls, xmlfile, *args, **kwargs): desc_tag = None try: - iter = etree.iterparse(xmlfile, ['start', 'end']) - for (event, element) in iter: + elements = etree.iterparse(xmlfile, ['start', 'end']) + for (event, element) in elements: if element.tag == RDFNS('RDF') and event == 'start': desc_tag = element break @@ -214,7 +219,7 @@ class WorkInfo(object): Check if there are rdf:RDF and rdf:Description tags.") # continue 'till the end of RDF section - for (event, element) in iter: + for (event, element) in elements: if element.tag == RDFNS('RDF') and event == 'end': break @@ -252,13 +257,13 @@ class WorkInfo(object): self.fmap = {} for field in self.FIELDS: - value = field.validate(dc_fields, fallbacks=fallbacks, - strict=strict) + value = field.validate(dc_fields, fallbacks=fallbacks, strict=strict) if field.multiple: value = getattr(self, 'prop_' + field.name, []) + value setattr(self, 'prop_' + field.name, value) self.fmap[field.name] = field - if field.salias: self.fmap[field.salias] = field + if field.salias: + self.fmap[field.salias] = field def __getattribute__(self, name): try: @@ -266,7 +271,8 @@ class WorkInfo(object): value = object.__getattribute__(self, 'prop_'+field.name) if field.name == name: return value - else: # singular alias + else: + # singular alias if not field.multiple: raise "OUCH!! for field %s" % name @@ -279,7 +285,8 @@ class WorkInfo(object): field = object.__getattribute__(self, 'fmap')[name] if field.name == name: object.__setattr__(self, 'prop_'+field.name, newvalue) - else: # singular alias + else: + # singular alias if not field.multiple: raise "OUCH! while setting field %s" % name @@ -291,13 +298,13 @@ class WorkInfo(object): """Update using field_dict. Verify correctness, but don't check if all required fields are present.""" for field in self.FIELDS: - if field_dict.has_key(field.name): + if field.name in field_dict: setattr(self, field.name, field_dict[field.name]) - def to_etree(self, parent = None): + def to_etree(self, parent=None): """XML representation of this object.""" - #etree._namespace_map[str(self.RDF)] = 'rdf' - #etree._namespace_map[str(self.DC)] = 'dc' + # etree._namespace_map[str(self.RDF)] = 'rdf' + # etree._namespace_map[str(self.DC)] = 'dc' if parent is None: root = etree.Element(RDFNS('RDF')) @@ -313,7 +320,8 @@ class WorkInfo(object): v = getattr(self, field.name, None) if v is not None: if field.multiple: - if len(v) == 0: continue + if len(v) == 0: + continue for x in v: e = etree.Element(field.uri) if x is not None: @@ -327,16 +335,16 @@ class WorkInfo(object): return root def serialize(self): - rdf = {} - rdf['about'] = { 'uri': RDFNS('about'), 'value': self.about } + rdf = {'about': {'uri': RDFNS('about'), 'value': self.about}} dc = {} for field in self.FIELDS: v = getattr(self, field.name, None) if v is not None: if field.multiple: - if len(v) == 0: continue - v = [ unicode(x) for x in v if x is not None ] + if len(v) == 0: + continue + v = [unicode(x) for x in v if x is not None] else: v = unicode(v) @@ -351,43 +359,38 @@ class WorkInfo(object): if v is not None: if field.multiple: - if len(v) == 0: continue - v = [ unicode(x) for x in v if x is not None ] + if len(v) == 0: + continue + v = [unicode(x) for x in v if x is not None] else: v = unicode(v) result[field.name] = v if field.salias: v = getattr(self, field.salias) - if v is not None: result[field.salias] = unicode(v) + if v is not None: + result[field.salias] = unicode(v) return result class BookInfo(WorkInfo): FIELDS = ( - Field( DCNS('audience'), 'audiences', salias='audience', multiple=True, - required=False), - - Field( DCNS('subject.period'), 'epochs', salias='epoch', multiple=True, - required=False), - Field( DCNS('subject.type'), 'kinds', salias='kind', multiple=True, - required=False), - Field( DCNS('subject.genre'), 'genres', salias='genre', multiple=True, - required=False), - - Field( DCNS('contributor.translator'), 'translators', \ - as_person, salias='translator', multiple=True, default=[]), - Field( DCNS('relation.hasPart'), 'parts', - WLURI, strict=as_wluri_strict, multiple=True, required=False), - Field( DCNS('relation.isVariantOf'), 'variant_of', - WLURI, strict=as_wluri_strict, required=False), - Field( DCNS('relation'), 'relations', - WLURI, strict=as_wluri_strict, multiple=True, required=False), - - Field( DCNS('relation.coverImage.url'), 'cover_url', required=False), - Field( DCNS('relation.coverImage.attribution'), 'cover_by', required=False), - Field( DCNS('relation.coverImage.source'), 'cover_source', required=False), + Field(DCNS('audience'), 'audiences', salias='audience', multiple=True, required=False), + + Field(DCNS('subject.period'), 'epochs', salias='epoch', multiple=True, required=False), + Field(DCNS('subject.type'), 'kinds', salias='kind', multiple=True, required=False), + Field(DCNS('subject.genre'), 'genres', salias='genre', multiple=True, required=False), + + Field(DCNS('contributor.translator'), 'translators', as_person, salias='translator', multiple=True, + default=[]), + Field(DCNS('relation.hasPart'), 'parts', WLURI, strict=as_wluri_strict, multiple=True, required=False), + Field(DCNS('relation.isVariantOf'), 'variant_of', WLURI, strict=as_wluri_strict, required=False), + Field(DCNS('relation'), 'relations', WLURI, strict=as_wluri_strict, multiple=True, required=False), + + Field(DCNS('relation.coverImage.url'), 'cover_url', required=False), + Field(DCNS('relation.coverImage.attribution'), 'cover_by', required=False), + Field(DCNS('relation.coverImage.source'), 'cover_source', required=False), ) diff --git a/librarian/epub.py b/librarian/epub.py index 8141eea..01f5c92 100644 --- a/librarian/epub.py +++ b/librarian/epub.py @@ -33,6 +33,7 @@ def inner_xml(node): nt = node.text if node.text is not None else '' return ''.join([nt] + [etree.tostring(child) for child in node]) + def set_inner_xml(node, text): """ sets node's text and children from a string @@ -121,7 +122,7 @@ class Stanza(object): >>> print etree.tostring(s) abx/ ycd - + """ def __init__(self, stanza_elem): self.stanza = stanza_elem @@ -194,7 +195,7 @@ def add_to_manifest(manifest, partno): def add_to_spine(spine, partno): """ Adds a node to the spine section in content.opf file """ - e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno}); + e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno}) spine.append(e) @@ -286,7 +287,7 @@ def chop(main_text): # prepare a container for each chunk part_xml = etree.Element('utwor') etree.SubElement(part_xml, 'master') - main_xml_part = part_xml[0] # master + main_xml_part = part_xml[0] # master last_node_part = False for one_part in main_text: @@ -304,8 +305,10 @@ def chop(main_text): yield part_xml -def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]): +def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=None): """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """ + if _empty_html_static is None: + _empty_html_static = [] toc = TOC() for element in chunk_xml[0]: @@ -351,8 +354,7 @@ def transform(wldoc, verbose=False, # write book title page html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl')) chars = used_chars(html_tree.getroot()) - zip.writestr('OPS/title.html', - etree.tostring(html_tree, method="html", pretty_print=True)) + zip.writestr('OPS/title.html', etree.tostring(html_tree, method="html", pretty_print=True)) # add a title page TOC entry toc.add(u"Strona tytułowa", "title.html") elif wldoc.book_info.parts: @@ -403,7 +405,6 @@ def transform(wldoc, verbose=False, return toc, chunk_counter, chars, sample - document = deepcopy(wldoc) del wldoc @@ -429,11 +430,12 @@ def transform(wldoc, verbose=False, mime.compress_type = zipfile.ZIP_STORED mime.extra = '' zip.writestr(mime, 'application/epub+zip') - zip.writestr('META-INF/container.xml', '' \ - '' \ - '') + zip.writestr( + 'META-INF/container.xml', '' + '' + '') zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png')) zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png')) if not style: @@ -467,14 +469,14 @@ def transform(wldoc, verbose=False, opf.getroot()[0].append(etree.fromstring('')) guide.append(etree.fromstring('')) - annotations = etree.Element('annotations') - toc_file = etree.fromstring('' \ - '' \ - '') + toc_file = etree.fromstring( + '' + '' + '') nav_map = toc_file[-1] if html_toc: @@ -512,7 +514,7 @@ def transform(wldoc, verbose=False, zip.writestr('OPS/last.html', etree.tostring( html_tree, method="html", pretty_print=True)) - if not flags or not 'without-fonts' in flags: + if not flags or 'without-fonts' not in flags: # strip fonts tmpdir = mkdtemp('-librarian-epub') try: diff --git a/librarian/fb2.py b/librarian/fb2.py index 1e110f5..bc1504d 100644 --- a/librarian/fb2.py +++ b/librarian/fb2.py @@ -17,10 +17,11 @@ functions.reg_person_name() def sectionify(tree): """Finds section headers and adds a tree of _section tags.""" - sections = ['naglowek_czesc', - 'naglowek_akt', 'naglowek_rozdzial', 'naglowek_scena', - 'naglowek_podrozdzial'] - section_level = dict((v,k) for (k,v) in enumerate(sections)) + sections = [ + 'naglowek_czesc', + 'naglowek_akt', 'naglowek_rozdzial', 'naglowek_scena', + 'naglowek_podrozdzial'] + section_level = {v: k for (k, v) in enumerate(sections)} # We can assume there are just subelements an no text at section level. for level, section_name in reversed(list(enumerate(sections))): diff --git a/librarian/functions.py b/librarian/functions.py index bd05ff4..7eb9d56 100644 --- a/librarian/functions.py +++ b/librarian/functions.py @@ -8,6 +8,7 @@ import re from librarian.dcparser import Person + def _register_function(f): """ Register extension function with lxml """ ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') @@ -15,13 +16,14 @@ def _register_function(f): ENTITY_SUBSTITUTIONS = [ - (u'---', u'—'), - (u'--', u'–'), - (u'...', u'…'), - (u',,', u'„'), - (u'"', u'”'), + (u'---', u'—'), + (u'--', u'–'), + (u'...', u'…'), + (u',,', u'„'), + (u'"', u'”'), ] + def substitute_entities(text): """XPath extension function converting all entites in passed text.""" if isinstance(text, list): diff --git a/librarian/html.py b/librarian/html.py index 985970a..848935a 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -22,12 +22,15 @@ STYLESHEETS = { 'partial': 'xslt/wl2html_partial.xslt' } + def get_stylesheet(name): return os.path.join(os.path.dirname(__file__), STYLESHEETS[name]) + def html_has_content(text): return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text) + def transform(wldoc, stylesheet='legacy', options=None, flags=None): """Transforms the WL document to XHTML. @@ -53,14 +56,14 @@ def transform(wldoc, stylesheet='legacy', options=None, flags=None): if not options: options = {} result = document.transform(style, **options) - del document # no longer needed large object :) + del document # no longer needed large object :) if html_has_content(result): add_anchors(result.getroot()) add_table_of_contents(result.getroot()) - return IOFile.from_string(etree.tostring(result, method='html', - xml_declaration=False, pretty_print=True, encoding='utf-8')) + return IOFile.from_string( + etree.tostring(result, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8')) else: return None except KeyError: @@ -68,6 +71,7 @@ def transform(wldoc, stylesheet='legacy', options=None, flags=None): except (XMLSyntaxError, XSLTApplyError), e: raise ParseError(e) + class Fragment(object): def __init__(self, id, themes): super(Fragment, self).__init__() @@ -96,7 +100,8 @@ class Fragment(object): result = [] for event, element in self.closed_events(): if event == 'start': - result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items()))) + result.append(u'<%s %s>' % ( + element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items()))) if element.text: result.append(element.text) elif event == 'end': @@ -126,7 +131,8 @@ def extract_fragments(input_filename): for event, element in etree.iterparse(buf, events=('start', 'end')): # Process begin and end elements if element.get('class', '') in ('theme-begin', 'theme-end'): - if not event == 'end': continue # Process elements only once, on end event + if not event == 'end': + continue # Process elements only once, on end event # Open new fragment if element.get('class', '') == 'theme-begin': @@ -159,11 +165,10 @@ def extract_fragments(input_filename): for fragment_id in open_fragments: open_fragments[fragment_id].append('text', element.tail) - # Process all elements except begin and end else: # Omit annotation tags - if (len(element.get('name', '')) or + if (len(element.get('name', '')) or element.get('class', '') in ('annotation', 'anchor')): if event == 'end' and element.tail: for fragment_id in open_fragments: @@ -206,10 +211,13 @@ def any_ancestor(element, test): def add_anchors(root): counter = 1 + + def is_side_text(e): + side_classes = ('note', 'motto', 'motto_podpis', 'dedication') + return e.get('class') in side_classes or e.get('id') == 'nota_red' or e.tag == 'blockquote' + for element in root.iterdescendants(): - if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication') - or e.get('id') == 'nota_red' - or e.tag == 'blockquote'): + if any_ancestor(element, is_side_text): continue if element.tag == 'p' and 'verse' in element.get('class', ''): @@ -232,9 +240,13 @@ def raw_printable_text(element): def add_table_of_contents(root): sections = [] counter = 1 + + def is_side_text(e): + return e.get('id') in ('footnotes', 'nota_red') or e.get('class') == 'person-list' + for element in root.iterdescendants(): if element.tag in ('h2', 'h3'): - if any_ancestor(element, lambda e: e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)): + if any_ancestor(element, is_side_text): continue element_text = raw_printable_text(element) @@ -257,9 +269,9 @@ def add_table_of_contents(root): if len(subsections): subsection_list = etree.SubElement(section_element, 'ol') - for n, subsection, text, _ in subsections: + for n1, subsection, text1, _ in subsections: subsection_element = etree.SubElement(subsection_list, 'li') - add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text) + add_anchor(subsection_element, "s%d" % n1, with_target=False, link_text=text1) root.insert(0, toc) @@ -276,4 +288,3 @@ def extract_annotations(html_path): text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip() html_str = etree.tostring(footnote, method='html', encoding='utf-8') yield anchor, text_str, html_str - diff --git a/librarian/mobi.py b/librarian/mobi.py index 9558452..104f1c0 100644 --- a/librarian/mobi.py +++ b/librarian/mobi.py @@ -9,7 +9,6 @@ import subprocess from tempfile import NamedTemporaryFile from librarian import IOFile -from librarian.cover import WLCover from librarian import get_resource @@ -28,8 +27,8 @@ def transform(wldoc, verbose=False, book_info = document.book_info # provide a cover by default - if not cover: - cover = WLCover + # if not cover: + # cover = WLCover cover_file = NamedTemporaryFile(suffix='.png', delete=False) bound_cover = cover(book_info) bound_cover.save(cover_file) @@ -43,8 +42,8 @@ def transform(wldoc, verbose=False, if not flags: flags = [] flags = list(flags) + ['without-fonts'] - epub = document.as_epub(verbose=verbose, sample=sample, html_toc=True, - flags=flags, style=get_resource('mobi/style.css')) + epub = document.as_epub( + verbose=verbose, sample=sample, html_toc=True, flags=flags, style=get_resource('mobi/style.css')) if verbose: kwargs = {} @@ -54,7 +53,8 @@ def transform(wldoc, verbose=False, output_file = NamedTemporaryFile(prefix='librarian', suffix='.mobi', delete=False) output_file.close() - subprocess.check_call(['ebook-convert', epub.get_filename(), output_file.name, - '--no-inline-toc', '--cover=%s' % cover_file.name], **kwargs) + subprocess.check_call( + ['ebook-convert', epub.get_filename(), output_file.name, '--no-inline-toc', '--cover=%s' % cover_file.name], + **kwargs) os.unlink(cover_file.name) - return IOFile.from_filename(output_file.name) \ No newline at end of file + return IOFile.from_filename(output_file.name) diff --git a/librarian/packagers.py b/librarian/packagers.py index ddfd7c8..0dbb6e8 100644 --- a/librarian/packagers.py +++ b/librarian/packagers.py @@ -8,38 +8,38 @@ from copy import deepcopy from lxml import etree from librarian import pdf, epub, DirDocProvider, ParseError, cover from librarian.parser import WLDocument +from librarian.styles.wolnelektury.partners import cover class Packager(object): cover = None flags = None + converter = NotImplemented + ext = NotImplemented @classmethod - def prepare_file(cls, main_input, output_dir, verbose=False): + def prepare_file(cls, main_input, output_dir): path, fname = os.path.realpath(main_input).rsplit('/', 1) provider = DirDocProvider(path) slug, ext = os.path.splitext(fname) if output_dir != '': - try: + if not os.path.isdir(output_dir): os.makedirs(output_dir) - except: - pass outfile = os.path.join(output_dir, slug + '.' + cls.ext) doc = WLDocument.from_file(main_input, provider=provider) - output_file = cls.converter.transform(doc, - cover=cls.cover, flags=cls.flags) + output_file = cls.converter.transform(doc, cover=cls.cover, flags=cls.flags) doc.save_output_file(output_file, output_path=outfile) - @classmethod def prepare(cls, input_filenames, output_dir='', verbose=False): + main_input = None try: for main_input in input_filenames: if verbose: print main_input - cls.prepare_file(main_input, output_dir, verbose) + cls.prepare_file(main_input, output_dir) except ParseError, e: print '%(file)s:%(name)s:%(message)s' % { 'file': main_input, @@ -52,6 +52,7 @@ class EpubPackager(Packager): converter = epub ext = 'epub' + class PdfPackager(Packager): converter = pdf ext = 'pdf' @@ -60,16 +61,20 @@ class PdfPackager(Packager): class GandalfEpubPackager(EpubPackager): cover = cover.GandalfCover + class GandalfPdfPackager(PdfPackager): cover = cover.GandalfCover + class BookotekaEpubPackager(EpubPackager): cover = cover.BookotekaCover + class PrestigioEpubPackager(EpubPackager): cover = cover.PrestigioCover flags = ('less-advertising',) + class PrestigioPdfPackager(PdfPackager): cover = cover.PrestigioCover flags = ('less-advertising',) @@ -107,6 +112,7 @@ class VirtualoPackager(Packager): PL """) + main_input = None try: for main_input in input_filenames: if verbose: @@ -133,17 +139,13 @@ class VirtualoPackager(Packager): cover.VirtualoCover(info).save(os.path.join(outfile_dir, slug+'.jpg')) outfile = os.path.join(outfile_dir, '1.epub') outfile_sample = os.path.join(outfile_dir, '1.sample.epub') - doc.save_output_file(doc.as_epub(), - output_path=outfile) - doc.save_output_file(doc.as_epub(doc, sample=25), - output_path=outfile_sample) + doc.save_output_file(doc.as_epub(), output_path=outfile) + doc.save_output_file(doc.as_epub(doc, sample=25), output_path=outfile_sample) outfile = os.path.join(outfile_dir, '1.mobi') outfile_sample = os.path.join(outfile_dir, '1.sample.mobi') - doc.save_output_file(doc.as_mobi(cover=cover.VirtualoCover), - output_path=outfile) + doc.save_output_file(doc.as_mobi(cover=cover.VirtualoCover), output_path=outfile) doc.save_output_file( - doc.as_mobi(doc, cover=cover.VirtualoCover, sample=25), - output_path=outfile_sample) + doc.as_mobi(doc, cover=cover.VirtualoCover, sample=25), output_path=outfile_sample) except ParseError, e: print '%(file)s:%(name)s:%(message)s' % { 'file': main_input, diff --git a/librarian/parser.py b/librarian/parser.py index 9300aa6..113fbbe 100644 --- a/librarian/parser.py +++ b/librarian/parser.py @@ -3,7 +3,7 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from librarian import ValidationError, NoDublinCore, ParseError, NoProvider +from librarian import ValidationError, NoDublinCore, ParseError from librarian import RDFNS, IOFile from librarian import dcparser @@ -15,11 +15,13 @@ import os import re from StringIO import StringIO + class WLDocument(object): LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE) provider = None _edoc = None + @property def edoc(self): if self._edoc is None: @@ -28,13 +30,14 @@ class WLDocument(object): data = data.decode('utf-8') data = data.replace(u'\ufeff', '') try: - parser = etree.XMLParser(remove_blank_text=False) + parser = etree.XMLParser() self._edoc = etree.parse(StringIO(data.encode('utf-8')), parser) except (ExpatError, XMLSyntaxError, XSLTApplyError), e: raise ParseError(e) return self._edoc _rdf_elem = None + @property def rdf_elem(self): if self._rdf_elem is None: @@ -45,6 +48,7 @@ class WLDocument(object): return self._rdf_elem _book_info = None + @property def book_info(self): if not self.parse_dublincore: @@ -54,20 +58,19 @@ class WLDocument(object): self.rdf_elem, fallbacks=self.meta_fallbacks, strict=self.strict) return self._book_info - def __init__(self, iofile, provider=None, - parse_dublincore=True, # shouldn't it be in a subclass? - strict=False, # ? - meta_fallbacks=None # ? - ): + def __init__(self, iofile, provider=None, parse_dublincore=True, # shouldn't it be in a subclass? + strict=False, # ? + meta_fallbacks=None): # ? self.source = iofile self.provider = provider self.parse_dublincore = parse_dublincore self.strict = strict self.meta_fallbacks = meta_fallbacks - if self.edoc.getroot().tag != 'utwor': + root_elem = self.edoc.getroot() + if root_elem.tag != 'utwor': raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag) if parse_dublincore: - self.book_info + self.book_info() @classmethod def from_string(cls, xml, *args, **kwargs): @@ -78,7 +81,6 @@ class WLDocument(object): iofile = IOFile.from_filename(xmlfile) return cls(iofile, *args, **kwargs) - def swap_endlines(self): """Converts line breaks in stanzas into
tags.""" # only swap inside stanzas @@ -119,7 +121,7 @@ class WLDocument(object): parts.append(part) else: tag, n = match.groups() - parts.append("*[%d][name() = '%s']" % (int(n)+1, tag) ) + parts.append("*[%d][name() = '%s']" % (int(n)+1, tag)) if parts[0] == '.': parts[0] = '' @@ -132,7 +134,7 @@ class WLDocument(object): def update_dc(self): if self.book_info: parent = self.rdf_elem.getparent() - parent.replace( self.rdf_elem, self.book_info.to_etree(parent) ) + parent.replace(self.rdf_elem, self.book_info.to_etree(parent)) def serialize(self): self.update_dc() @@ -145,18 +147,19 @@ class WLDocument(object): try: xpath = self.path_to_xpath(key) node = self.edoc.xpath(xpath)[0] - repl = etree.fromstring(u"<%s>%s" %(node.tag, data, node.tag) ) + repl = etree.fromstring(u"<%s>%s" % (node.tag, data, node.tag)) node.getparent().replace(node, repl) except Exception, e: - unmerged.append( repr( (key, xpath, e) ) ) + # WTF xpath may be unused; also: too broad except + unmerged.append(repr((key, xpath, e))) return unmerged def clean_ed_note(self): """ deletes forbidden tags from nota_red """ - for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in - ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))): + forbidden_tags = ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw') + for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in forbidden_tags)): tail = node.tail node.clear() node.tag = 'span' @@ -194,15 +197,12 @@ class WLDocument(object): cover_class = WLCover return cover_class(self.book_info, *args, **kwargs).output_file() - def save_output_file(self, output_file, output_path=None, - output_dir_path=None, make_author_dir=False, ext=None): + def save_output_file(self, output_file, output_path=None, output_dir_path=None, make_author_dir=False, ext=None): if output_dir_path: save_path = output_dir_path if make_author_dir: - save_path = os.path.join(save_path, - unicode(self.book_info.author).encode('utf-8')) - save_path = os.path.join(save_path, - self.book_info.uri.slug) + save_path = os.path.join(save_path, unicode(self.book_info.author).encode('utf-8')) + save_path = os.path.join(save_path, self.book_info.uri.slug) if ext: save_path += '.%s' % ext else: diff --git a/librarian/pdf.py b/librarian/pdf.py index 7889a22..2f5c209 100644 --- a/librarian/pdf.py +++ b/librarian/pdf.py @@ -21,7 +21,6 @@ from subprocess import call, PIPE from Texml.processor import process from lxml import etree -from lxml.etree import XMLSyntaxError, XSLTApplyError from librarian.dcparser import Person from librarian.parser import WLDocument @@ -39,11 +38,12 @@ STYLESHEETS = { 'wl2tex': 'pdf/wl2tex.xslt', } + def insert_tags(doc, split_re, tagname, exclude=None): """ inserts for every occurence of `split_re' in text nodes in the `doc' tree - >>> t = etree.fromstring('A-B-CX-Y-Z'); - >>> insert_tags(t, re.compile('-'), 'd'); + >>> t = etree.fromstring('A-B-CX-Y-Z') + >>> insert_tags(t, re.compile('-'), 'd') >>> print etree.tostring(t) ABCXYZ """ @@ -87,10 +87,14 @@ def fix_hanging(doc): def move_motifs_inside(doc): """ moves motifs to be into block elements """ - for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'): + main_tags = ('powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp', + 'dramat_wierszowany_l', 'dramat_wierszowany_lp', 'dramat_wspolczesny') + for master in doc.xpath('|'.join('//' + tag for tag in main_tags)): for motif in master.xpath('motyw'): for sib in motif.itersiblings(): - if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'): + special_tags = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', + 'begin', 'end', 'motyw', 'extra', 'uwaga') + if sib.tag not in special_tags: # motif shouldn't have a tail - it would be untagged text motif.tail = None motif.getparent().remove(motif) @@ -136,9 +140,10 @@ def parse_creator(doc): Finds all dc:creator and dc.contributor.translator tags and adds *_parsed versions with forenames first. """ - for person in doc.xpath("|".join('//dc:'+(tag) for tag in ( - 'creator', 'contributor.translator')), - namespaces = {'dc': str(DCNS)})[::-1]: + persons = doc.xpath( + "|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')), + namespaces={'dc': str(DCNS)})[::-1] + for person in persons: if not person.text: continue p = Person.from_text(person.text) @@ -193,8 +198,7 @@ def load_including_children(wldoc=None, provider=None, uri=None): text = re.sub(ur"([\u0400-\u04ff]+)", ur"\1", text) - document = WLDocument.from_string(text, - parse_dublincore=True, provider=provider) + document = WLDocument.from_string(text, parse_dublincore=True, provider=provider) document.swap_endlines() for child_uri in document.book_info.parts: @@ -246,8 +250,8 @@ class PDFFormat(Format): # Copy style shutil.copy(get_resource('pdf/wl.cls'), temp) shutil.copy(self.style, os.path.join(temp, 'style.sty')) - #for sfile in ['wasysym.sty', 'uwasyvar.fd', 'uwasy.fd']: - # shutil.copy(get_resource(os.path.join('res/wasysym', sfile)), temp) + # for sfile in ['wasysym.sty', 'uwasyvar.fd', 'uwasy.fd']: + # shutil.copy(get_resource(os.path.join('res/wasysym', sfile)), temp) # Save attachments if self.cover: @@ -263,13 +267,13 @@ class PDFFormat(Format): cwd = None os.chdir(temp) + p = None if self.verbose: - for i in range(self.tex_passes): + for i in xrange(self.tex_passes): p = call(['xelatex', tex_path]) else: - for i in range(self.tex_passes): - p = call(['xelatex', '-interaction=batchmode', tex_path], - stdout=PIPE, stderr=PIPE) + for i in xrange(self.tex_passes): + p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE) if p: raise ParseError("Error parsing .tex file: %s" % tex_path) diff --git a/librarian/picture.py b/librarian/picture.py index ee3c61d..b665a34 100644 --- a/librarian/picture.py +++ b/librarian/picture.py @@ -1,5 +1,5 @@ - -from dcparser import (as_person, as_date, Field, WorkInfo, DCNS) +# -*- coding: utf-8 -*- +from dcparser import Field, WorkInfo, DCNS from librarian import (RDFNS, ValidationError, NoDublinCore, ParseError, WLURI) from xml.parsers.expat import ExpatError from os import path @@ -10,14 +10,14 @@ import re class WLPictureURI(WLURI): - _re_wl_uri = re.compile('http://wolnelektury.pl/katalog/obraz/' - '(?P[-a-z0-9]+)/?$') + _re_wl_uri = re.compile('http://wolnelektury.pl/katalog/obraz/(?P[-a-z0-9]+)/?$') @classmethod def from_slug(cls, slug): uri = 'http://wolnelektury.pl/katalog/obraz/%s/' % slug return cls(uri) + def as_wlpictureuri_strict(text): return WLPictureURI.strict(text) @@ -36,15 +36,15 @@ class PictureInfo(WorkInfo): Field(DCNS('description.medium'), 'medium', required=False), Field(DCNS('description.dimensions'), 'original_dimensions', required=False), Field(DCNS('format'), 'mime_type', required=False), - Field(DCNS('identifier.url'), 'url', WLPictureURI, - strict=as_wlpictureuri_strict), - ) + Field(DCNS('identifier.url'), 'url', WLPictureURI, strict=as_wlpictureuri_strict), + ) class ImageStore(object): - EXT = ['gif', 'jpeg', 'png', 'swf', 'psd', 'bmp' - 'tiff', 'tiff', 'jpc', 'jp2', 'jpf', 'jb2', 'swc', - 'aiff', 'wbmp', 'xbm'] + EXT = [ + 'gif', 'jpeg', 'png', 'swf', 'psd', 'bmp' + 'tiff', 'tiff', 'jpc', 'jp2', 'jpf', 'jb2', 'swc', + 'aiff', 'wbmp', 'xbm'] MIME = ['image/gif', 'image/jpeg', 'image/png', 'application/x-shockwave-flash', 'image/psd', 'image/bmp', 'image/tiff', 'image/tiff', 'application/octet-stream', @@ -53,7 +53,7 @@ class ImageStore(object): def __init__(self, dir_): self.dir = dir_ - return super(ImageStore, self).__init__() + super(ImageStore, self).__init__() def path(self, slug, mime_type): """ @@ -94,20 +94,16 @@ class WLPicture(object): else: self.picture_info = None - @classmethod - def from_string(cls, xml, *args, **kwargs): - return cls.from_file(StringIO(xml), *args, **kwargs) - @classmethod def from_file(cls, xmlfile, parse_dublincore=True, image_store=None): # first, prepare for parsing if isinstance(xmlfile, basestring): - file = open(xmlfile, 'rb') + xmlfile = open(xmlfile, 'rb') try: - data = file.read() + data = xmlfile.read() finally: - file.close() + xmlfile.close() else: data = xmlfile.read() @@ -121,7 +117,7 @@ class WLPicture(object): image_store = ImageStore(path.dirname(xmlfile.name)) try: - parser = etree.XMLParser(remove_blank_text=False) + parser = etree.XMLParser() tree = etree.parse(StringIO(data.encode('utf-8')), parser) return cls(tree, parse_dublincore=parse_dublincore, image_store=image_store) @@ -152,14 +148,11 @@ class WLPicture(object): Iterates the parts of this picture and returns them and their metadata """ for part in self.edoc.iter("div"): - pd = {} - pd['type'] = part.get('type') + pd = {'themes': [], 'object': None, 'type': part.get('type')} if pd['type'] == 'area': pd['coords'] = ((int(part.get('x1')), int(part.get('y1'))), (int(part.get('x2')), int(part.get('y2')))) - pd['themes'] = [] - pd['object'] = None parent = part while True: parent = parent.getparent() diff --git a/librarian/pyhtml.py b/librarian/pyhtml.py index 16a2141..163d11c 100644 --- a/librarian/pyhtml.py +++ b/librarian/pyhtml.py @@ -251,7 +251,7 @@ class EduModule(Xmill): subgen = EduModule(self.options) definiens_s = subgen.generate(definiens) else: - print "!! Missing definiendum in source: '%s'" % element.text + print ("!! Missing definiendum in source: '%s'" % element.text).encode('utf-8') return u"
" % self.naglowek_to_anchor(element), u"
" + definiens_s diff --git a/librarian/pypdf.py b/librarian/pypdf.py index bb2881f..9851cb1 100644 --- a/librarian/pypdf.py +++ b/librarian/pypdf.py @@ -18,8 +18,7 @@ from urllib2 import urlopen from lxml import etree -from xmlutils import Xmill, tag, tagged, ifoption, tag_open_close -from librarian.dcparser import Person +from xmlutils import Xmill, ifoption, tag_open_close from librarian import DCNS, get_resource, IOFile from librarian import functions from pdf import PDFFormat, substitute_hyphens, fix_hanging @@ -33,7 +32,8 @@ def escape(really): prefix = (u'' % (really and 1 or 0)) postfix = u'' if isinstance(value, list): - import pdb; pdb.set_trace() + import pdb + pdb.set_trace() if isinstance(value, tuple): return prefix + value[0], value[1] + postfix else: @@ -87,16 +87,15 @@ class EduModule(Xmill): return values def handle_rdf__RDF(self, _): - "skip metadata in generation" + """skip metadata in generation""" return @escape(True) def get_rightsinfo(self, element): rights_lic = self.get_dc(element, 'rights.license', True) - return u'' + \ - (rights_lic and u'%s' % rights_lic or '') +\ - u'%s' % self.get_dc(element, 'rights', True) +\ - u'' + return u'' + (rights_lic and u'%s' % rights_lic or '') + \ + u'%s' % self.get_dc(element, 'rights', True) + \ + u'' @escape(True) def get_authors(self, element, which=None): @@ -116,31 +115,31 @@ class EduModule(Xmill): def handle_utwor(self, element): lines = [ u''' - - - \\documentclass[%s]{wl} - \\usepackage{style}''' % self.options['customization_str'], - self.options['has_cover'] and '\usepackage{makecover}', - (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or - (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or - (self.options['morefloats'] == 'none' and - u'''\\IfFileExists{morefloats.sty}{ - \\usepackage{morefloats} - }{}'''), - u'''\\def\\authors{%s}''' % self.get_authors(element), - u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'), - u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'), - u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'), - - u'''\\author{\\authors}''', - u'''\\title{%s}''' % self.get_title(element), - u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(), - u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element), - u''] + + + \\documentclass[%s]{wl} + \\usepackage{style}''' % self.options['customization_str'], + self.options['has_cover'] and '\usepackage{makecover}', + (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or + (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or + (self.options['morefloats'] == 'none' and + u'''\\IfFileExists{morefloats.sty}{ + \\usepackage{morefloats} + }{}'''), + u'''\\def\\authors{%s}''' % self.get_authors(element), + u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'), + u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'), + u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'), + + u'''\\author{\\authors}''', + u'''\\title{%s}''' % self.get_title(element), + u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(), + u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element), + u'' + ] return u"".join(filter(None, lines)), u'' - @escape(1) def handle_powiesc(self, element): return u""" @@ -154,45 +153,42 @@ class EduModule(Xmill): return u'' % cmd, u'' handle_akap = \ - handle_akap = \ - handle_akap_cd = \ - handle_akap_cd = \ - handle_akap_dialog = \ - handle_akap_dialog = \ - handle_autor_utworu = \ - handle_dedykacja = \ - handle_didaskalia = \ - handle_didask_tekst = \ - handle_dlugi_cytat = \ - handle_dzielo_nadrzedne = \ - handle_lista_osoba = \ - handle_mat = \ - handle_miejsce_czas = \ - handle_motto = \ - handle_motto_podpis = \ - handle_naglowek_akt = \ - handle_naglowek_czesc = \ - handle_naglowek_listy = \ - handle_naglowek_osoba = \ - handle_naglowek_scena = \ - handle_nazwa_utworu = \ - handle_nota = \ - handle_osoba = \ - handle_pa = \ - handle_pe = \ - handle_podtytul = \ - handle_poezja_cyt = \ - handle_pr = \ - handle_pt = \ - handle_sekcja_asterysk = \ - handle_sekcja_swiatlo = \ - handle_separator_linia = \ - handle_slowo_obce = \ - handle_srodtytul = \ - handle_tytul_dziela = \ - handle_wyroznienie = \ - handle_dywiz = \ - handle_texcommand + handle_akap_cd = \ + handle_akap_dialog = \ + handle_autor_utworu = \ + handle_dedykacja = \ + handle_didaskalia = \ + handle_didask_tekst = \ + handle_dlugi_cytat = \ + handle_dzielo_nadrzedne = \ + handle_lista_osoba = \ + handle_mat = \ + handle_miejsce_czas = \ + handle_motto = \ + handle_motto_podpis = \ + handle_naglowek_akt = \ + handle_naglowek_czesc = \ + handle_naglowek_listy = \ + handle_naglowek_osoba = \ + handle_naglowek_scena = \ + handle_nazwa_utworu = \ + handle_nota = \ + handle_osoba = \ + handle_pa = \ + handle_pe = \ + handle_podtytul = \ + handle_poezja_cyt = \ + handle_pr = \ + handle_pt = \ + handle_sekcja_asterysk = \ + handle_sekcja_swiatlo = \ + handle_separator_linia = \ + handle_slowo_obce = \ + handle_srodtytul = \ + handle_tytul_dziela = \ + handle_wyroznienie = \ + handle_dywiz = \ + handle_texcommand def handle_naglowek_rozdzial(self, element): if not self.options['teacher']: @@ -220,6 +216,7 @@ class EduModule(Xmill): def handle_uwaga(self, _e): return None + def handle_extra(self, _e): return None @@ -247,13 +244,16 @@ class EduModule(Xmill): opis = '' n = element.xpath('wskazowki') - if n: wskazowki = submill.generate(n[0]) - - else: wskazowki = '' + if n: + wskazowki = submill.generate(n[0]) + else: + wskazowki = '' n = element.xpath('pomoce') - if n: pomoce = submill.generate(n[0]) - else: pomoce = '' + if n: + pomoce = submill.generate(n[0]) + else: + pomoce = '' forma = ''.join(element.xpath('forma/text()')) @@ -296,7 +296,7 @@ class EduModule(Xmill): def handle_forma(self, *_): return - def handle_lista(self, element, attrs={}): + def handle_lista(self, element, attrs=None): ltype = element.attrib.get('typ', 'punkt') if not element.findall("punkt"): if ltype == 'czytelnia': @@ -309,13 +309,15 @@ class EduModule(Xmill): # print '** missing src on , setting default' surl = 'http://edukacjamedialna.edu.pl/lekcje/slowniczek/' sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string()) - self.options = {'slowniczek': True, 'slowniczek_xml': sxml } + self.options = {'slowniczek': True, 'slowniczek_xml': sxml} - listcmd = {'num': 'enumerate', - 'punkt': 'itemize', - 'alfa': 'itemize', - 'slowniczek': 'itemize', - 'czytelnia': 'itemize'}[ltype] + listcmd = { + 'num': 'enumerate', + 'punkt': 'itemize', + 'alfa': 'itemize', + 'slowniczek': 'itemize', + 'czytelnia': 'itemize' + }[ltype] return u'' % listcmd, u'' @@ -334,7 +336,7 @@ class EduModule(Xmill): typ = element.attrib['typ'] self.exercise_counter += 1 - if not typ in exercise_handlers: + if typ not in exercise_handlers: return '(no handler)' self.options = {'exercise_counter': self.exercise_counter} handler = exercise_handlers[typ](self.options, self.state) @@ -376,14 +378,13 @@ class EduModule(Xmill): max_col = len(ks) self.options = {'columnts': max_col} # styling: - # has_frames = int(element.attrib.get("ramki", "0")) - # if has_frames: frames_c = "framed" - # else: frames_c = "" - # return u"""""" % frames_c, u"
" + # has_frames = int(element.attrib.get("ramki", "0")) + # if has_frames: frames_c = "framed" + # else: frames_c = "" + # return u"""""" % frames_c, u"
" return u''' tabular%s - ''' % ('l' * max_col), \ - u'''tabular''' + ''' % ('l' * max_col), u'''tabular''' @escape(1) def handle_wiersz(self, element): @@ -424,8 +425,7 @@ class EduModule(Xmill): print '!! unknown