From: Radek Czajka Date: Thu, 2 May 2013 10:17:09 +0000 (+0200) Subject: Some experiments with the language: html, epub, covers. X-Git-Url: https://git.mdrn.pl/librarian.git/commitdiff_plain/13480b3da2d3da87f1d99c6d340c1553ca9d89c1?ds=inline;hp=0cefa871f6f1253be544a39c51e1f66f536805ce Some experiments with the language: html, epub, covers. --- diff --git a/librarian/__init__.py b/librarian/__init__.py index c46d5d1..0616f23 100644 --- a/librarian/__init__.py +++ b/librarian/__init__.py @@ -3,12 +3,10 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from __future__ import with_statement - import os import re -import shutil import urllib +from .utils import XMLNamespace class UnicodeException(Exception): @@ -31,31 +29,6 @@ class ParseError(UnicodeException): class ValidationError(UnicodeException): pass -class NoDublinCore(ValidationError): - """There's no DublinCore section, and it's required.""" - pass - -class NoProvider(UnicodeException): - """There's no DocProvider specified, and it's needed.""" - pass - -class XMLNamespace(object): - '''A handy structure to repsent names in an XML namespace.''' - - def __init__(self, uri): - self.uri = uri - - def __call__(self, tag): - return '{%s}%s' % (self.uri, tag) - - def __contains__(self, tag): - return tag.startswith('{' + str(self) + '}') - - def __repr__(self): - return 'XMLNamespace(%r)' % self.uri - - def __str__(self): - return '%s' % self.uri class EmptyNamespace(XMLNamespace): def __init__(self): @@ -72,7 +45,7 @@ XHTMLNS = XMLNamespace("http://www.w3.org/1999/xhtml") NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/") OPFNS = XMLNamespace("http://www.idpf.org/2007/opf") -WLNS = EmptyNamespace() +SSTNS = XMLNamespace('http://nowoczesnapolska.org.pl/sst#') class WLURI(object): @@ -117,165 +90,7 @@ class WLURI(object): return self.slug == other.slug -class DocProvider(object): - """Base class for a 
repository of XML files. - - Used for generating joined files, like EPUBs. - """ - - def by_slug(self, slug): - """Should return a file-like object with a WL document XML.""" - raise NotImplementedError - - def by_uri(self, uri, wluri=WLURI): - """Should return a file-like object with a WL document XML.""" - wluri = wluri(uri) - return self.by_slug(wluri.slug) - - -class DirDocProvider(DocProvider): - """ Serve docs from a directory of files in form .xml """ - - def __init__(self, dir_): - self.dir = dir_ - self.files = {} - - def by_slug(self, slug): - fname = slug + '.xml' - return open(os.path.join(self.dir, fname)) - - -import lxml.etree as etree -import dcparser - -DEFAULT_BOOKINFO = dcparser.BookInfo( - { RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'}, - { DCNS('creator'): [u'Some, Author'], - DCNS('title'): [u'Some Title'], - DCNS('subject.period'): [u'Unknown'], - DCNS('subject.type'): [u'Unknown'], - DCNS('subject.genre'): [u'Unknown'], - DCNS('date'): ['1970-01-01'], - DCNS('language'): [u'pol'], - # DCNS('date'): [creation_date], - DCNS('publisher'): [u"Fundacja Nowoczesna Polska"], - DCNS('description'): - [u"""Publikacja zrealizowana w ramach projektu - Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa - wykonana przez Bibliotekę Narodową z egzemplarza - pochodzącego ze zbiorów BN."""], - DCNS('identifier.url'): [WLURI.example], - DCNS('rights'): - [u"Domena publiczna - zm. 
[OPIS STANU PRAWNEGO TEKSTU]"] }) - -def xinclude_forURI(uri): - e = etree.Element(XINS("include")) - e.set("href", uri) - return etree.tostring(e, encoding=unicode) - -def wrap_text(ocrtext, creation_date, bookinfo=DEFAULT_BOOKINFO): - """Wrap the text within the minimal XML structure with a DC template.""" - bookinfo.created_at = creation_date - - dcstring = etree.tostring(bookinfo.to_etree(), \ - method='xml', encoding=unicode, pretty_print=True) - - return u'\n' + dcstring + u'\n\n' + ocrtext + \ - u'\n\n' - - -def serialize_raw(element): - b = u'' + (element.text or '') - - for child in element.iterchildren(): - e = etree.tostring(child, method='xml', encoding=unicode, - pretty_print=True) - b += e - - return b - -SERIALIZERS = { - 'raw': serialize_raw, -} - -def serialize_children(element, format='raw'): - return SERIALIZERS[format](element) - -def get_resource(path): - return os.path.join(os.path.dirname(__file__), path) - - -class OutputFile(object): - """Represents a file returned by one of the converters.""" - - _string = None - _filename = None - - def __del__(self): - if self._filename: - os.unlink(self._filename) - - def __nonzero__(self): - return self._string is not None or self._filename is not None - - @classmethod - def from_string(cls, string): - """Converter returns contents of a file as a string.""" - - instance = cls() - instance._string = string - return instance - - @classmethod - def from_filename(cls, filename): - """Converter returns contents of a file as a named file.""" - - instance = cls() - instance._filename = filename - return instance - - def get_string(self): - """Get file's contents as a string.""" - - if self._filename is not None: - with open(self._filename) as f: - return f.read() - else: - return self._string - - def get_file(self): - """Get file as a file-like object.""" - - if self._string is not None: - from StringIO import StringIO - return StringIO(self._string) - elif self._filename is not None: - return 
open(self._filename) - - def get_filename(self): - """Get file as a fs path.""" - - if self._filename is not None: - return self._filename - elif self._string is not None: - from tempfile import NamedTemporaryFile - temp = NamedTemporaryFile(prefix='librarian-', delete=False) - temp.write(self._string) - temp.close() - self._filename = temp.name - return self._filename - else: - return None - - def save_as(self, path): - """Save file to a path. Create directories, if necessary.""" - - dirname = os.path.dirname(os.path.abspath(path)) - if not os.path.isdir(dirname): - os.makedirs(dirname) - shutil.copy(self.get_filename(), path) - - class URLOpener(urllib.FancyURLopener): - version = 'FNP Librarian (http://github.com/fnp/librarian)' + version = 'FNP Librarian (http://git.nowoczesnapolska.org.pl/?p=librarian.git)' urllib._urlopener = URLOpener() + diff --git a/librarian/book2anything.py b/librarian/book2anything.py index b8b8d27..b50cb1c 100755 --- a/librarian/book2anything.py +++ b/librarian/book2anything.py @@ -8,9 +8,8 @@ from collections import namedtuple import os.path import optparse -from librarian import DirDocProvider, ParseError -from librarian.parser import WLDocument -from librarian.cover import WLCover +from librarian import ParseError +from librarian.document import Document class Option(object): @@ -34,47 +33,26 @@ class Book2Anything(object): Subclass it for any format you want to convert to. """ - format_name = None # Set format name, like "PDF". - ext = None # Set file extension, like "pdf". - uses_cover = False # Can it add a cover? - cover_optional = True # Only relevant if uses_cover - uses_provider = False # Does it need a DocProvider? - transform = None # Transform method. Uses WLDocument.as_{ext} by default. - parser_options = [] # List of Option objects for additional parser args. - transform_options = [] # List of Option objects for additional transform args. - transform_flags = [] # List of Option objects for supported transform flags. 
- + format_cls = None # A formats.Format subclass + document_options = [] # List of Option objects for document options. + format_options = [] # List of Option objects for format customization. + build_options = [] # List of Option objects for build options. @classmethod def run(cls): # Parse commandline arguments usage = """Usage: %%prog [options] SOURCE [SOURCE...] - Convert SOURCE files to %s format.""" % cls.format_name + Convert SOURCE files to %s.""" % cls.format_cls.format_name parser = optparse.OptionParser(usage=usage) parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, help='print status messages to stdout') - parser.add_option('-d', '--make-dir', - action='store_true', dest='make_dir', default=False, - help='create a directory for author and put the output file in it') parser.add_option('-o', '--output-file', dest='output_file', metavar='FILE', help='specifies the output file') - parser.add_option('-O', '--output-dir', - dest='output_dir', metavar='DIR', - help='specifies the directory for output') - if cls.uses_cover: - if cls.cover_optional: - parser.add_option('-c', '--with-cover', - action='store_true', dest='with_cover', default=False, - help='create default cover') - parser.add_option('-C', '--image-cache', - dest='image_cache', metavar='URL', - help='prefix for image download cache' + - (' (implies --with-cover)' if cls.cover_optional else '')) - for option in cls.parser_options + cls.transform_options + cls.transform_flags: + for option in cls.document_options + cls.format_options + cls.build_options: option.add(parser) options, input_filenames = parser.parse_args() @@ -83,28 +61,18 @@ class Book2Anything(object): parser.print_help() return(1) - # Prepare additional args for parser. - parser_args = {} - for option in cls.parser_options: - parser_args[option.name()] = option.value(options) - # Prepare additional args for transform method. 
- transform_args = {} - for option in cls.transform_options: - transform_args[option.name()] = option.value(options) - # Add flags to transform_args, if any. - transform_flags = [flag.name() for flag in cls.transform_flags - if flag.value(options)] - if transform_flags: - transform_args['flags'] = transform_flags - # Add cover support, if any. - if cls.uses_cover: - if options.image_cache: - def cover_class(*args, **kwargs): - return WLCover(image_cache=options.image_cache, *args, **kwargs) - transform_args['cover'] = cover_class - elif not cls.cover_optional or options.with_cover: - transform_args['cover'] = WLCover - + # Prepare additional args for document. + document_args = {} + for option in cls.document_options: + document_args[option.name()] = option.value(options) + # Prepare additional args for format. + format_args = {} + for option in cls.format_options: + format_args[option.name()] = option.value(options) + # Prepare additional args for build. + build_args = {} + for option in cls.build_options: + build_args[option.name()] = option.value(options) # Do some real work try: @@ -112,28 +80,18 @@ class Book2Anything(object): if options.verbose: print main_input - # Where to find input? - if cls.uses_provider: - path, fname = os.path.realpath(main_input).rsplit('/', 1) - provider = DirDocProvider(path) - else: - provider = None + # Do the transformation. + doc = Document.from_file(main_input, **document_args) + format_ = cls.format_cls(doc, **format_args) # Where to write output? - if not (options.output_file or options.output_dir): - output_file = os.path.splitext(main_input)[0] + '.' + cls.ext + if not options.output_file: + output_file = os.path.splitext(main_input)[0] + '.' + format_.format_ext else: output_file = None - - # Do the transformation. 
- doc = WLDocument.from_file(main_input, provider=provider, **parser_args) - transform = cls.transform - if transform is None: - transform = getattr(WLDocument, 'as_%s' % cls.ext) - output = transform(doc, **transform_args) - - doc.save_output_file(output, - output_file, options.output_dir, options.make_dir, cls.ext) + + output = format_.build(**build_args) + output.save_as(output_file) except ParseError, e: print '%(file)s:%(name)s:%(message)s' % { diff --git a/librarian/core.py b/librarian/core.py new file mode 100755 index 0000000..0b90a2e --- /dev/null +++ b/librarian/core.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +from lxml import etree +from librarian import SSTNS +from .meta import Metadata + + +class TextElement(etree.ElementBase): + @property + def meta(self): + m = self.find(SSTNS('metadata')) + if m is None: + return Metadata.about(self) + return m + + +class Span(TextElement): + pass + + +class Div(TextElement): + pass + + +class Section(TextElement): + pass + + +class Header(TextElement): + pass + + +class Aside(TextElement): + pass diff --git a/librarian/cover.py b/librarian/cover.py deleted file mode 100644 index a37b911..0000000 --- a/librarian/cover.py +++ /dev/null @@ -1,438 +0,0 @@ -# -*- coding: utf-8 -*- -# -# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. -# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
-# -import re -import Image, ImageFont, ImageDraw, ImageFilter, ImageEnhance -from StringIO import StringIO -from librarian import get_resource, OutputFile, URLOpener - - -class Metric(object): - """Gets metrics from an object, scaling it by a factor.""" - def __init__(self, obj, scale): - self._obj = obj - self._scale = float(scale) - - def __getattr__(self, name): - src = getattr(self._obj, name) - if src and self._scale: - src = type(src)(self._scale * src) - return src - - -class TextBox(object): - """Creates an Image with a series of centered strings.""" - - SHADOW_X = 3 - SHADOW_Y = 3 - SHADOW_BLUR = 3 - - def __init__(self, max_width, max_height, padding_x=None, padding_y=None): - if padding_x is None: - padding_x = self.SHADOW_X + self.SHADOW_BLUR - if padding_y is None: - padding_y = self.SHADOW_Y + self.SHADOW_BLUR - - self.max_width = max_width - self.max_text_width = max_width - 2 * padding_x - self.padding_y = padding_y - self.height = padding_y - self.img = Image.new('RGBA', (max_width, max_height)) - self.draw = ImageDraw.Draw(self.img) - self.shadow_img = None - self.shadow_draw = None - - def skip(self, height): - """Skips some vertical space.""" - self.height += height - - def text(self, text, color='#000', font=None, line_height=20, - shadow_color=None): - """Writes some centered text.""" - text = re.sub(r'\s+', ' ', text) - if shadow_color: - if not self.shadow_img: - self.shadow_img = Image.new('RGBA', self.img.size) - self.shadow_draw = ImageDraw.Draw(self.shadow_img) - while text: - line = text - line_width = self.draw.textsize(line, font=font)[0] - while line_width > self.max_text_width: - parts = line.rsplit(' ', 1) - if len(parts) == 1: - line_width = self.max_text_width - break - line = parts[0] - line_width = self.draw.textsize(line, font=font)[0] - line = line.strip() + ' ' - - pos_x = (self.max_width - line_width) / 2 - - if shadow_color: - self.shadow_draw.text( - (pos_x + self.SHADOW_X, self.height + self.SHADOW_Y), - line, 
font=font, fill=shadow_color - ) - - self.draw.text((pos_x, self.height), line, font=font, fill=color) - self.height += line_height - # go to next line - text = text[len(line):] - - def image(self): - """Creates the actual Image object.""" - image = Image.new('RGBA', (self.max_width, - self.height + self.padding_y)) - if self.shadow_img: - shadow = self.shadow_img.filter(ImageFilter.BLUR) - image.paste(shadow, (0, 0), shadow) - image.paste(self.img, (0, 0), self.img) - else: - image.paste(self.img, (0, 0)) - return image - - -class Cover(object): - """Abstract base class for cover images generator.""" - width = 600 - height = 800 - background_color = '#fff' - background_img = None - - author_top = 100 - author_margin_left = 20 - author_margin_right = 20 - author_lineskip = 40 - author_color = '#000' - author_shadow = None - author_font_ttf = get_resource('fonts/DejaVuSerif.ttf') - author_font_size = 30 - - title_top = 100 - title_margin_left = 20 - title_margin_right = 20 - title_lineskip = 54 - title_color = '#000' - title_shadow = None - title_font_ttf = get_resource('fonts/DejaVuSerif.ttf') - title_font_size = 40 - - logo_bottom = None - logo_width = None - uses_dc_cover = False - - format = 'JPEG' - scale = 1 - - exts = { - 'JPEG': 'jpg', - 'PNG': 'png', - } - - mime_types = { - 'JPEG': 'image/jpeg', - 'PNG': 'image/png', - } - - def __init__(self, book_info, format=None, width=None, height=None): - self.author = ", ".join(auth.readable() for auth in book_info.authors) - self.title = book_info.title - if format is not None: - self.format = format - scale = max(float(width or 0) / self.width, float(height or 0) / self.height) - if scale: - self.scale = scale - - def pretty_author(self): - """Allows for decorating author's name.""" - return self.author - - def pretty_title(self): - """Allows for decorating title.""" - return self.title - - def image(self): - metr = Metric(self, self.scale) - img = Image.new('RGB', (metr.width, metr.height), self.background_color) 
- - if self.background_img: - background = Image.open(self.background_img) - img.paste(background, None, background) - del background - - # WL logo - if metr.logo_width: - logo = Image.open(get_resource('res/wl-logo.png')) - logo = logo.resize((metr.logo_width, logo.size[1] * metr.logo_width / logo.size[0])) - img.paste(logo, ((metr.width - metr.logo_width) / 2, img.size[1] - logo.size[1] - metr.logo_bottom)) - - top = metr.author_top - tbox = TextBox( - metr.width - metr.author_margin_left - metr.author_margin_right, - metr.height - top, - ) - - author_font = ImageFont.truetype( - self.author_font_ttf, metr.author_font_size) - tbox.text(self.pretty_author(), self.author_color, author_font, - metr.author_lineskip, self.author_shadow) - text_img = tbox.image() - img.paste(text_img, (metr.author_margin_left, top), text_img) - - top += text_img.size[1] + metr.title_top - tbox = TextBox( - metr.width - metr.title_margin_left - metr.title_margin_right, - metr.height - top, - ) - title_font = ImageFont.truetype( - self.title_font_ttf, metr.title_font_size) - tbox.text(self.pretty_title(), self.title_color, title_font, - metr.title_lineskip, self.title_shadow) - text_img = tbox.image() - img.paste(text_img, (metr.title_margin_left, top), text_img) - - return img - - def mime_type(self): - return self.mime_types[self.format] - - def ext(self): - return self.exts[self.format] - - def save(self, *args, **kwargs): - return self.image().save(format=self.format, quality=95, *args, **kwargs) - - def output_file(self, *args, **kwargs): - imgstr = StringIO() - self.save(imgstr, *args, **kwargs) - return OutputFile.from_string(imgstr.getvalue()) - - -class WLCover(Cover): - """Default Wolne Lektury cover generator.""" - width = 600 - height = 833 - uses_dc_cover = True - author_font_ttf = get_resource('fonts/JunicodeWL-Regular.ttf') - author_font_size = 20 - author_lineskip = 30 - title_font_ttf = get_resource('fonts/DejaVuSerif-Bold.ttf') - title_font_size = 30 - title_lineskip = 
40 - title_box_width = 350 - - box_top_margin = 100 - box_bottom_margin = 100 - box_padding_y = 20 - box_above_line = 10 - box_below_line = 15 - box_line_left = 75 - box_line_right = 275 - box_line_width = 2 - - logo_top = 15 - logo_width = 140 - - bar_width = 35 - background_color = '#444' - author_color = '#444' - default_background = get_resource('res/cover.png') - format = 'JPEG' - - epoch_colors = { - u'Starożytność': '#9e3610', - u'Średniowiecze': '#564c09', - u'Renesans': '#8ca629', - u'Barok': '#a6820a', - u'Oświecenie': '#f2802e', - u'Romantyzm': '#db4b16', - u'Pozytywizm': '#961060', - u'Modernizm': '#7784e0', - u'Dwudziestolecie międzywojenne': '#3044cf', - u'Współczesność': '#06393d', - } - - def __init__(self, book_info, format=None, width=None, height=None, with_logo=False): - super(WLCover, self).__init__(book_info, format=format, width=width, height=height) - self.kind = book_info.kind - self.epoch = book_info.epoch - self.with_logo = with_logo - if book_info.cover_url: - url = book_info.cover_url - bg_src = None - if bg_src is None: - bg_src = URLOpener().open(url) - self.background_img = StringIO(bg_src.read()) - bg_src.close() - else: - self.background_img = self.default_background - - def pretty_author(self): - return self.author.upper() - - def image(self): - metr = Metric(self, self.scale) - img = Image.new('RGB', (metr.width, metr.height), self.background_color) - draw = ImageDraw.Draw(img) - - if self.epoch in self.epoch_colors: - epoch_color = self.epoch_colors[self.epoch] - else: - epoch_color = '#000' - draw.rectangle((0, 0, metr.bar_width, metr.height), fill=epoch_color) - - if self.background_img: - src = Image.open(self.background_img) - trg_size = (metr.width - metr.bar_width, metr.height) - if src.size[0] * trg_size[1] < src.size[1] * trg_size[0]: - resized = ( - trg_size[0], - src.size[1] * trg_size[0] / src.size[0] - ) - cut = (resized[1] - trg_size[1]) / 2 - src = src.resize(resized, Image.ANTIALIAS) - src = src.crop((0, cut, 
src.size[0], src.size[1] - cut)) - else: - resized = ( - src.size[0] * trg_size[1] / src.size[1], - trg_size[1], - ) - cut = (resized[0] - trg_size[0]) / 2 - src = src.resize(resized, Image.ANTIALIAS) - src = src.crop((cut, 0, src.size[0] - cut, src.size[1])) - - img.paste(src, (metr.bar_width, 0)) - del src - - box = TextBox(metr.title_box_width, metr.height, padding_y=metr.box_padding_y) - author_font = ImageFont.truetype( - self.author_font_ttf, metr.author_font_size) - box.text(self.pretty_author(), - font=author_font, - line_height=metr.author_lineskip, - color=self.author_color, - shadow_color=self.author_shadow, - ) - - box.skip(metr.box_above_line) - box.draw.line((metr.box_line_left, box.height, metr.box_line_right, box.height), - fill=self.author_color, width=metr.box_line_width) - box.skip(metr.box_below_line) - - title_font = ImageFont.truetype( - self.title_font_ttf, metr.title_font_size) - box.text(self.pretty_title(), - line_height=metr.title_lineskip, - font=title_font, - color=epoch_color, - shadow_color=self.title_shadow, - ) - - if self.with_logo: - logo = Image.open(get_resource('res/wl-logo-mono.png')) - logo = logo.resize((metr.logo_width, logo.size[1] * metr.logo_width / logo.size[0]), Image.ANTIALIAS) - alpha = logo.split()[3] - alpha = ImageEnhance.Brightness(alpha).enhance(.75) - logo.putalpha(alpha) - box.skip(metr.logo_top + logo.size[1]) - - box_img = box.image() - - if self.kind == 'Liryka': - # top - box_top = metr.box_top_margin - elif self.kind == 'Epika': - # bottom - box_top = metr.height - metr.box_bottom_margin - box_img.size[1] - else: - # center - box_top = (metr.height - box_img.size[1]) / 2 - - box_left = metr.bar_width + (metr.width - metr.bar_width - - box_img.size[0]) / 2 - draw.rectangle((box_left, box_top, - box_left + box_img.size[0], box_top + box_img.size[1]), - fill='#fff') - img.paste(box_img, (box_left, box_top), box_img) - - if self.with_logo: - img.paste(logo, - (box_left + (box_img.size[0] - logo.size[0]) / 2, 
- box_top + box_img.size[1] - metr.box_padding_y - logo.size[1]), mask=logo) - - return img - - -class VirtualoCover(Cover): - width = 600 - height = 730 - author_top = 73 - title_top = 73 - logo_bottom = 25 - logo_width = 250 - - -class PrestigioCover(Cover): - width = 580 - height = 783 - background_img = get_resource('res/cover-prestigio.png') - - author_top = 446 - author_margin_left = 118 - author_margin_right = 62 - author_lineskip = 60 - author_color = '#fff' - author_shadow = '#000' - author_font_ttf = get_resource('fonts/JunicodeWL-Italic.ttf') - author_font_size = 50 - - title_top = 0 - title_margin_left = 118 - title_margin_right = 62 - title_lineskip = 60 - title_color = '#fff' - title_shadow = '#000' - title_font_ttf = get_resource('fonts/JunicodeWL-Italic.ttf') - title_font_size = 50 - - def pretty_title(self): - return u"„%s”" % self.title - - -class BookotekaCover(Cover): - width = 2140 - height = 2733 - background_img = get_resource('res/cover-bookoteka.png') - - author_top = 480 - author_margin_left = 307 - author_margin_right = 233 - author_lineskip = 156 - author_color = '#d9d919' - author_font_ttf = get_resource('fonts/JunicodeWL-Regular.ttf') - author_font_size = 130 - - title_top = 400 - title_margin_left = 307 - title_margin_right = 233 - title_lineskip = 168 - title_color = '#d9d919' - title_font_ttf = get_resource('fonts/JunicodeWL-Regular.ttf') - title_font_size = 140 - - format = 'PNG' - - -class GandalfCover(Cover): - width = 600 - height = 730 - background_img = get_resource('res/cover-gandalf.png') - author_font_ttf = get_resource('fonts/JunicodeWL-Regular.ttf') - author_font_size = 30 - title_font_ttf = get_resource('fonts/JunicodeWL-Regular.ttf') - title_font_size = 40 - logo_bottom = 25 - logo_width = 250 - format = 'PNG' diff --git a/librarian/document.py b/librarian/document.py new file mode 100755 index 0000000..32148e3 --- /dev/null +++ b/librarian/document.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +# +# This file is part of 
Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +from StringIO import StringIO +from lxml import etree +from . import SSTNS +from .core import Section +from .parser import SSTParser + + +class Document(object): + # Do I use meta_context? + def __init__(self, edoc, meta_context=None): + self.edoc = edoc + + root_elem = edoc.getroot() + if meta_context is not None: + root_elem.meta_context = meta_context + + if not isinstance(root_elem, Section): + if root_elem.tag != SSTNS('section'): + raise ValidationError("Invalid root element. Found '%s', should be '%s'" % ( + root_elem.tag, SSTNS('section'))) + else: + raise ValidationError("Invalid class of root element. " + "Use librarian.parser.SSTParser.") + + @classmethod + def from_string(cls, xml, *args, **kwargs): + return cls.from_file(StringIO(xml), *args, **kwargs) + + @classmethod + def from_file(cls, xmlfile, *args, **kwargs): + # first, prepare for parsing + if isinstance(xmlfile, basestring): + file = open(xmlfile, 'rb') + try: + data = file.read() + finally: + file.close() + else: + data = xmlfile.read() + + if not isinstance(data, unicode): + data = data.decode('utf-8') + + data = data.replace(u'\ufeff', '') + + parser = SSTParser() + tree = etree.parse(StringIO(data.encode('utf-8')), parser) + tree.xinclude() + return cls(tree, *args, **kwargs) + + @property + def meta(self): + """ Document's metadata is root's metadata. """ + return self.edoc.getroot().meta diff --git a/librarian/epub/cover.html b/librarian/epub/cover.html deleted file mode 100644 index 784067c..0000000 --- a/librarian/epub/cover.html +++ /dev/null @@ -1,13 +0,0 @@ - - - - - Okładka - - - -
- Okładka -
- - \ No newline at end of file diff --git a/librarian/formats/__init__.py b/librarian/formats/__init__.py new file mode 100644 index 0000000..cfe4fc2 --- /dev/null +++ b/librarian/formats/__init__.py @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +class Format(object): + """ Generic format class. """ + def __init__(self, doc): + self.doc = doc + + def build(self): + raise NotImplementedError diff --git a/librarian/formats/cover/__init__.py b/librarian/formats/cover/__init__.py new file mode 100644 index 0000000..7a787e8 --- /dev/null +++ b/librarian/formats/cover/__init__.py @@ -0,0 +1,219 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +import re +from PIL import Image, ImageFont, ImageDraw, ImageFilter, ImageEnhance +from StringIO import StringIO +from librarian import DCNS, URLOpener +from librarian.output import OutputFile +from librarian.utils import get_resource +from librarian.formats import Format + + +class Metric(object): + """Gets metrics from an object, scaling it by a factor.""" + def __init__(self, obj, scale): + self._obj = obj + self._scale = float(scale) + + def __getattr__(self, name): + src = getattr(self._obj, name) + if src and self._scale: + src = type(src)(self._scale * src) + return src + + +class TextBox(object): + """Creates an Image with a series of centered strings.""" + + SHADOW_X = 3 + SHADOW_Y = 3 + SHADOW_BLUR = 3 + + def __init__(self, max_width, max_height, padding_x=None, padding_y=None): + if padding_x is None: + padding_x = self.SHADOW_X + self.SHADOW_BLUR + if padding_y is None: + padding_y = self.SHADOW_Y + self.SHADOW_BLUR + + self.max_width = max_width + self.max_text_width = max_width - 2 * padding_x + self.padding_y = padding_y + 
self.height = padding_y + self.img = Image.new('RGBA', (max_width, max_height)) + self.draw = ImageDraw.Draw(self.img) + self.shadow_img = None + self.shadow_draw = None + + def skip(self, height): + """Skips some vertical space.""" + self.height += height + + def text(self, text, color='#000', font=None, line_height=20, + shadow_color=None): + """Writes some centered text.""" + text = re.sub(r'\s+', ' ', text) + if shadow_color: + if not self.shadow_img: + self.shadow_img = Image.new('RGBA', self.img.size) + self.shadow_draw = ImageDraw.Draw(self.shadow_img) + while text: + line = text + line_width = self.draw.textsize(line, font=font)[0] + while line_width > self.max_text_width: + parts = line.rsplit(' ', 1) + if len(parts) == 1: + line_width = self.max_text_width + break + line = parts[0] + line_width = self.draw.textsize(line, font=font)[0] + line = line.strip() + ' ' + + pos_x = (self.max_width - line_width) / 2 + + if shadow_color: + self.shadow_draw.text( + (pos_x + self.SHADOW_X, self.height + self.SHADOW_Y), + line, font=font, fill=shadow_color + ) + + self.draw.text((pos_x, self.height), line, font=font, fill=color) + self.height += line_height + # go to next line + text = text[len(line):] + + def image(self): + """Creates the actual Image object.""" + image = Image.new('RGBA', (self.max_width, + self.height + self.padding_y)) + if self.shadow_img: + shadow = self.shadow_img.filter(ImageFilter.BLUR) + image.paste(shadow, (0, 0), shadow) + image.paste(self.img, (0, 0), self.img) + else: + image.paste(self.img, (0, 0)) + return image + + +class Cover(Format): + """Base class for cover images generator.""" + format_name = u"cover image" + + width = 600 + height = 800 + background_color = '#fff' + background_img = None + + author_top = 100 + author_margin_left = 20 + author_margin_right = 20 + author_lineskip = 40 + author_color = '#000' + author_shadow = None + author_font_ttf = get_resource('fonts/DejaVuSerif.ttf') + author_font_size = 30 + + title_top = 
100 + title_margin_left = 20 + title_margin_right = 20 + title_lineskip = 54 + title_color = '#000' + title_shadow = None + title_font_ttf = get_resource('fonts/DejaVuSerif.ttf') + title_font_size = 40 + + logo_bottom = None + logo_width = None + uses_dc_cover = False + + format = 'JPEG' + scale = 1 + + exts = { + 'JPEG': 'jpg', + 'PNG': 'png', + } + + mime_types = { + 'JPEG': 'image/jpeg', + 'PNG': 'image/png', + } + + def __init__(self, doc, format=None, width=None, height=None): + self.author = ", ".join(auth for auth in doc.meta.get(DCNS('creator'))) + self.title = doc.meta.title() + if format is not None: + self.format = format + scale = max(float(width or 0) / self.width, float(height or 0) / self.height) + if scale: + self.scale = scale + + def pretty_author(self): + """Allows for decorating author's name.""" + return self.author + + def pretty_title(self): + """Allows for decorating title.""" + return self.title + + def image(self): + metr = Metric(self, self.scale) + img = Image.new('RGB', (metr.width, metr.height), self.background_color) + + if self.background_img: + background = Image.open(self.background_img) + img.paste(background, None, background) + del background + + # WL logo + if metr.logo_width: + logo = Image.open(get_resource('res/wl-logo.png')) + logo = logo.resize((metr.logo_width, logo.size[1] * metr.logo_width / logo.size[0])) + img.paste(logo, ((metr.width - metr.logo_width) / 2, img.size[1] - logo.size[1] - metr.logo_bottom)) + + top = metr.author_top + tbox = TextBox( + metr.width - metr.author_margin_left - metr.author_margin_right, + metr.height - top, + ) + + author_font = ImageFont.truetype( + self.author_font_ttf, metr.author_font_size) + tbox.text(self.pretty_author(), self.author_color, author_font, + metr.author_lineskip, self.author_shadow) + text_img = tbox.image() + img.paste(text_img, (metr.author_margin_left, top), text_img) + + top += text_img.size[1] + metr.title_top + tbox = TextBox( + metr.width - metr.title_margin_left 
- metr.title_margin_right, + metr.height - top, + ) + title_font = ImageFont.truetype( + self.title_font_ttf, metr.title_font_size) + tbox.text(self.pretty_title(), self.title_color, title_font, + metr.title_lineskip, self.title_shadow) + text_img = tbox.image() + img.paste(text_img, (metr.title_margin_left, top), text_img) + + return img + imgstr = StringIO() + img.save(imgstr, format=self.format, quality=95) + OutputFile.from_string(imgstr.getvalue()) + + def mime_type(self): + return self.mime_types[self.format] + + @property + def format_ext(self): + return self.exts[self.format] + + def save(self, *args, **kwargs): + return self.image().save(format=self.format, quality=95, *args, **kwargs) + + def build(self, *args, **kwargs): + imgstr = StringIO() + self.save(imgstr, *args, **kwargs) + return OutputFile.from_string(imgstr.getvalue()) diff --git a/librarian/formats/cover/partners/__init__.py b/librarian/formats/cover/partners/__init__.py new file mode 100644 index 0000000..2d8a663 --- /dev/null +++ b/librarian/formats/cover/partners/__init__.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +from librarian.utils import get_resource +from .. 
import Cover  # completes the "from .. " that ends the previous line of this extract


class VirtualoCover(Cover):
    """Cover preset labelled "Virtualo cover image" (600x730)."""
    format_name = u"Virtualo cover image"

    width = 600
    height = 730
    # The base Cover.image() starts the author box at author_top and adds
    # title_top below the rendered author text before drawing the title.
    author_top = 73
    title_top = 73
    # The base Cover.image() pastes the logo only when logo_width is truthy,
    # keeping logo_bottom between the logo and the bottom edge.
    logo_bottom = 25
    logo_width = 250


class PrestigioCover(Cover):
    """Cover preset labelled "Prestigio cover image" (580x783).

    Uses a bitmap background and white italic text with a black shadow for
    both the author and the title.
    """
    format_name = u"Prestigio cover image"

    width = 580
    height = 783
    # Pasted over the solid background colour by the base Cover.image().
    background_img = get_resource('res/cover-prestigio.png')

    author_top = 446
    author_margin_left = 118
    author_margin_right = 62
    author_lineskip = 60
    author_color = '#fff'
    author_shadow = '#000'
    author_font_ttf = get_resource('fonts/JunicodeWL-Italic.ttf')
    author_font_size = 50

    # title_top = 0: the title follows the author text with no extra gap.
    title_top = 0
    title_margin_left = 118
    title_margin_right = 62
    title_lineskip = 60
    title_color = '#fff'
    title_shadow = '#000'
    title_font_ttf = get_resource('fonts/JunicodeWL-Italic.ttf')
    title_font_size = 50

    def pretty_title(self):
        """Return the title wrapped in Polish-style quotation marks."""
        return u"„%s”" % self.title


class BookotekaCover(Cover):
    """Cover preset labelled "Bookoteka cover image" (2140x2733, PNG).

    Uses a bitmap background and yellow ('#d9d919') text.
    """
    format_name = u"Bookoteka cover image"

    width = 2140
    height = 2733
    background_img = get_resource('res/cover-bookoteka.png')

    author_top = 480
    author_margin_left = 307
    author_margin_right = 233
    author_lineskip = 156
    author_color = '#d9d919'
    author_font_ttf = get_resource('fonts/JunicodeWL-Regular.ttf')
    author_font_size = 130

    title_top = 400
    title_margin_left = 307
    title_margin_right = 233
    title_lineskip = 168
    title_color = '#d9d919'
    title_font_ttf = get_resource('fonts/JunicodeWL-Regular.ttf')
    title_font_size = 140

    # Overrides the base class default ('JPEG').
    format = 'PNG'


class GandalfCover(Cover):
    """Cover preset labelled "Gandalf cover image" (600x730, PNG).

    Uses a bitmap background and enables the logo.
    """
    format_name = u"Gandalf cover image"

    width = 600
    height = 730
    background_img = get_resource('res/cover-gandalf.png')
    author_font_ttf = get_resource('fonts/JunicodeWL-Regular.ttf')
    author_font_size = 30
    title_font_ttf = get_resource('fonts/JunicodeWL-Regular.ttf')
    title_font_size = 40
    logo_bottom = 25
    logo_width = 250
    format = 'PNG'

# diff --git a/librarian/formats/cover/wolnelektury/__init__.py b/librarian/formats/cover/wolnelektury/__init__.py new file
# mode 100644 index 0000000..4218770 --- /dev/null +++ b/librarian/formats/cover/wolnelektury/__init__.py @@ -0,0 +1,166 @@
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from StringIO import StringIO

from PIL import Image, ImageFont, ImageDraw, ImageEnhance
from librarian.utils import get_resource
from .. import Cover, Metric, TextBox

try:
    from .. import URLOpener
except ImportError:
    # NOTE(review): this module used URLOpener without importing it.  It is
    # presumably meant to come from the parent cover package; until that is
    # confirmed, fall back to the standard library opener so that documents
    # with a cover_url do not fail with a NameError.
    from urllib import FancyURLopener as URLOpener


class WLCover(Cover):
    """Default Wolne Lektury cover generator.

    Draws a vertical bar coloured by the literary epoch, a background picture
    filling the rest of the cover and a white box holding the author, a
    separator line, the title and, optionally, the logo.
    """
    format_name = u"WL-style cover image"

    width = 600
    height = 833
    uses_dc_cover = True
    author_font_ttf = get_resource('fonts/JunicodeWL-Regular.ttf')
    author_font_size = 20
    author_lineskip = 30
    title_font_ttf = get_resource('fonts/DejaVuSerif-Bold.ttf')
    title_font_size = 30
    title_lineskip = 40
    title_box_width = 350

    box_top_margin = 100
    box_bottom_margin = 100
    box_padding_y = 20
    box_above_line = 10
    box_below_line = 15
    box_line_left = 75
    box_line_right = 275
    box_line_width = 2

    logo_top = 15
    logo_width = 140

    bar_width = 35
    background_color = '#444'
    author_color = '#444'
    default_background = get_resource('res/cover.png')
    format = 'JPEG'

    # Colour of the side bar and of the title, keyed by the 'epoch' metadata.
    epoch_colors = {
        u'Starożytność': '#9e3610',
        u'Średniowiecze': '#564c09',
        u'Renesans': '#8ca629',
        u'Barok': '#a6820a',
        u'Oświecenie': '#f2802e',
        u'Romantyzm': '#db4b16',
        u'Pozytywizm': '#961060',
        u'Modernizm': '#7784e0',
        u'Dwudziestolecie międzywojenne': '#3044cf',
        u'Współczesność': '#06393d',
    }

    def __init__(self, doc, format=None, width=None, height=None, with_logo=False):
        """Collect everything needed to draw the cover for `doc`.

        `format`, `width` and `height` are passed to the base class
        unchanged.  `with_logo` decides whether the logo is drawn at the
        bottom of the text box.  The 'kind', 'epoch' and 'cover_url'
        metadata are read from `doc.meta`; when a cover URL is present the
        picture is downloaded here and kept in memory, otherwise the bundled
        default background is used.
        """
        super(WLCover, self).__init__(doc, format=format, width=width, height=height)
        self.kind = doc.meta.get_one('kind')
        self.epoch = doc.meta.get_one('epoch')
        self.with_logo = with_logo
        # TODO
        cover_urls = doc.meta.get('cover_url')
        if cover_urls:
            bg_src = URLOpener().open(cover_urls[0])
            try:
                self.background_img = StringIO(bg_src.read())
            finally:
                # Close the remote handle even if reading it fails.
                bg_src.close()
        else:
            self.background_img = self.default_background

    def pretty_author(self):
        """Return the author's name in upper case."""
        return self.author.upper()

    def image(self):
        """Draw the cover and return it as a PIL image."""
        metr = Metric(self, self.scale)
        img = Image.new('RGB', (metr.width, metr.height), self.background_color)
        draw = ImageDraw.Draw(img)

        # Unknown or missing epochs get a black bar and title.
        epoch_color = self.epoch_colors.get(self.epoch, '#000')
        draw.rectangle((0, 0, metr.bar_width, metr.height), fill=epoch_color)

        if self.background_img:
            src = Image.open(self.background_img)
            trg_size = (metr.width - metr.bar_width, metr.height)
            if src.size[0] * trg_size[1] < src.size[1] * trg_size[0]:
                # Picture is proportionally taller than the target area:
                # match the width, then trim top and bottom equally.
                resized = (
                    trg_size[0],
                    src.size[1] * trg_size[0] / src.size[0]
                )
                cut = (resized[1] - trg_size[1]) / 2
                src = src.resize(resized, Image.ANTIALIAS)
                src = src.crop((0, cut, src.size[0], src.size[1] - cut))
            else:
                # Picture is proportionally wider: match the height, then
                # trim left and right equally.
                resized = (
                    src.size[0] * trg_size[1] / src.size[1],
                    trg_size[1],
                )
                cut = (resized[0] - trg_size[0]) / 2
                src = src.resize(resized, Image.ANTIALIAS)
                src = src.crop((cut, 0, src.size[0] - cut, src.size[1]))

            # The picture starts right of the epoch bar.
            img.paste(src, (metr.bar_width, 0))
            del src

        box = TextBox(metr.title_box_width, metr.height, padding_y=metr.box_padding_y)
        author_font = ImageFont.truetype(
            self.author_font_ttf, metr.author_font_size)
        box.text(self.pretty_author(),
            font=author_font,
            line_height=metr.author_lineskip,
            color=self.author_color,
            shadow_color=self.author_shadow,
            )

        # Separator line between the author and the title.
        box.skip(metr.box_above_line)
        box.draw.line((metr.box_line_left, box.height, metr.box_line_right, box.height),
            fill=self.author_color, width=metr.box_line_width)
        box.skip(metr.box_below_line)

        title_font = ImageFont.truetype(
            self.title_font_ttf, metr.title_font_size)
        box.text(self.pretty_title(),
            line_height=metr.title_lineskip,
            font=title_font,
            color=epoch_color,
            shadow_color=self.title_shadow,
            )

        if self.with_logo:
            logo = Image.open(get_resource('res/wl-logo-mono.png'))
            logo = logo.resize((metr.logo_width, logo.size[1] * metr.logo_width / logo.size[0]), Image.ANTIALIAS)
            # Scale the alpha band to 75% so the logo is partly transparent.
            alpha = logo.split()[3]
            alpha = ImageEnhance.Brightness(alpha).enhance(.75)
            logo.putalpha(alpha)
            # Only reserve the space here; the logo itself is pasted onto
            # the finished cover below.
            box.skip(metr.logo_top + logo.size[1])

        box_img = box.image()

        if self.kind == 'Liryka':
            # top
            box_top = metr.box_top_margin
        elif self.kind == 'Epika':
            # bottom
            box_top = metr.height - metr.box_bottom_margin - box_img.size[1]
        else:
            # center
            box_top = (metr.height - box_img.size[1]) / 2

        # Centre the box horizontally in the area right of the epoch bar.
        box_left = metr.bar_width + (metr.width - metr.bar_width -
                        box_img.size[0]) / 2
        draw.rectangle((box_left, box_top,
            box_left + box_img.size[0], box_top + box_img.size[1]),
            fill='#fff')
        img.paste(box_img, (box_left, box_top), box_img)

        if self.with_logo:
            img.paste(logo,
                (box_left + (box_img.size[0] - logo.size[0]) / 2,
                    box_top + box_img.size[1] - metr.box_padding_y - logo.size[1]), mask=logo)

        return img

# diff --git a/librarian/formats/epub/__init__.py b/librarian/formats/epub/__init__.py new file mode 100644 index 0000000..f9f7565 --- /dev/null +++ b/librarian/formats/epub/__init__.py @@ -0,0 +1,279 @@
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+# +import os +from copy import deepcopy +from tempfile import NamedTemporaryFile +import zipfile +from lxml import etree +from librarian import OPFNS, NCXNS, XHTMLNS +from librarian import core +from librarian.formats import Format +from librarian.formats.cover.wolnelektury import WLCover +from librarian.output import OutputFile +from librarian.renderers import Register, TreeRenderer, UnknownElement +from librarian.utils import Context, get_resource, extend_element + + +class EpubFormat(Format): + format_name = 'EPUB' + format_ext = 'epub' + + cover = WLCover + renderers = Register() + + def __init__(self, doc, cover=None, with_fonts=True): + super(EpubFormat, self).__init__(doc) + self.with_fonts = with_fonts + if cover is not None: + self.cover = cover + + def build(self): + opf = etree.parse(get_resource('formats/epub/res/content.opf')) + manifest = opf.find(OPFNS('manifest')) + guide = opf.find(OPFNS('guide')) + spine = opf.find(OPFNS('spine')) + + output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False) + zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED) + + mime = zipfile.ZipInfo() + mime.filename = 'mimetype' + mime.compress_type = zipfile.ZIP_STORED + mime.extra = '' + zip.writestr(mime, 'application/epub+zip') + zip.writestr('META-INF/container.xml', '' \ + '' \ + '') + + toc_file = etree.fromstring('' \ + '' \ + '') + nav_map = toc_file[-1] + + if self.cover is not None: + cover = self.cover(self.doc) + cover_output = cover.build() + cover_name = 'cover.%s' % cover.format_ext + zip.writestr(os.path.join('OPS', cover_name), cover_output.get_string()) + del cover_output + + cover_tree = etree.parse(get_resource('formats/epub/res/cover.html')) + cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name) + zip.writestr('OPS/cover.html', etree.tostring( + cover_tree, method="html", pretty_print=True)) + + if cover.uses_dc_cover: + if self.doc.meta.get_one('cover_by'): + document.edoc.getroot().set('data-cover-by', 
self.doc.meta.get_one('cover_by')) + if self.doc.meta.get_one('cover_source'): + document.edoc.getroot().set('data-cover-source', self.doc.meta.get_one('cover_source')) + + manifest.append(etree.fromstring( + '')) + manifest.append(etree.fromstring( + '' % (cover_name, cover.mime_type()))) + spine.insert(0, etree.fromstring('')) + opf.getroot()[0].append(etree.fromstring('')) + guide.append(etree.fromstring('')) + + + ctx = Context(format=self) + ctx.toc = TOC() + ctx.toc_level = 0 + ctx.footnotes = Footnotes() + ctx.part_no = 0 + + wrap_tmpl = etree.parse(get_resource('formats/epub/res/chapter.html')) + for e in self.render(self.doc.edoc.getroot(), ctx): + if not len(e) and not e.text.strip(): + continue + wrap = deepcopy(wrap_tmpl) + extend_element(wrap.find('//*[@id="book-text"]'), e) + + partstr = 'part%d' % int(e.get('part_no')) + manifest.append(manifest.makeelement(OPFNS('item'), attrib={ + 'id': partstr, + 'href': partstr + ".html", + 'media-type': 'application/xhtml+xml', + })) + spine.append(spine.makeelement(OPFNS('itemref'), attrib={ + 'idref': partstr, + })) + zip.writestr('OPS/%s.html' % partstr, etree.tostring(wrap, method='html')) + + if len(ctx.footnotes.output): + ctx.toc.add("Przypisy", "footnotes.html") + manifest.append(etree.Element(OPFNS('item'), + id='footnotes', href='footnotes.html', + **{'media-type': "application/xhtml+xml"})) + spine.append(etree.Element('itemref', idref='footnotes')) + wrap = etree.parse(get_resource('formats/epub/res/footnotes.html')) + extend_element(wrap.find('//*[@id="footnotes"]'), ctx.footnotes.output) + + #chars = chars.union(used_chars(html_tree.getroot())) + zip.writestr('OPS/footnotes.html', etree.tostring( + wrap, method="html", pretty_print=True)) + + + zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True)) + ctx.toc.render(toc_file[-1]) + zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True)) + zip.close() + return OutputFile.from_filename(output_file.name) + + def 
render(self, element, ctx): + return self.renderers.get_for(element).render(element, ctx) + + +# Helpers + +class EpubRenderer(TreeRenderer): + """ Renders insides as XML in a <_/> container. """ + def container(self, ctx): + root, inner = super(EpubRenderer, self).container() + root.set("part_no", str(ctx.part_no)) + return root, inner + + def render(self, element, ctx): + subctx = self.subcontext(element, ctx) + wrapper, inside = self.container(ctx) + if element.text: + extend_element(inside, self.render_text(element.text, ctx)) + for child in element: + try: + child_renderer = ctx.format.renderers.get_for(child) + except UnknownElement: + continue + else: + if getattr(child_renderer, 'epub_separate', False): + yield wrapper + ctx.part_no += 1 + for child_part in child_renderer.render(child, subctx): + yield child_part + wrapper, inside = self.container(ctx) + else: + child_parts = list(child_renderer.render(child, subctx)) + extend_element(inside, child_parts[0]) + if len(child_parts) > 1: + yield wrapper + for child_part in child_parts[1:-1]: + yield child_part + wrapper, inside = self.container(ctx) + extend_element(inside, child_parts[-1]) + finally: + if child.tail: + extend_element(inside, self.render_text(child.tail, ctx)) + yield wrapper + + +class Footnotes(object): + def __init__(self): + self.counter = 0 + self.output = etree.Element("_") + + def append(self, items): + self.counter += 1 + e = etree.Element("a", + href="part%d.html#footnote-anchor-%d" % (int(items[0].get('part_no')), self.counter), + id="footnote-%d" % self.counter, + style="float:left;margin-right:1em") + e.text = "[%d]" % self.counter + e.tail = " " + self.output.append(e) + for item in items: + extend_element(self.output, item) + anchor = etree.Element("a", + id="footnote-anchor-%d" % self.counter, + href="footnotes.html#footnote-%d" % self.counter) + anchor.text = "[%d]" % self.counter + return anchor + + +class TOC(object): + def __init__(self, title=None, href="", root=None): + if 
root is None: + self.counter = 0 + self.root = self + else: + self.root = root + self.children = [] + self.title = title + self.href = href.format(counter=self.root.counter) + self.number = self.root.counter + self.root.counter += 1 + + def add(self, title, href): + subtoc = type(self)(title, href, root=self.root) + self.children.append(subtoc) + return subtoc + + def render(self, nav_map): + for child in self.children: + nav_point = etree.Element(NCXNS('navPoint')) + nav_point.set('id', 'NavPoint-%d' % child.number) + nav_point.set('playOrder', str(child.number)) + + nav_label = etree.Element(NCXNS('navLabel')) + text = etree.Element(NCXNS('text')) + text.text = child.title + nav_label.append(text) + nav_point.append(nav_label) + + content = etree.Element(NCXNS('content')) + content.set('src', child.href) + nav_point.append(content) + nav_map.append(nav_point) + child.render(nav_map) + + +# Renderers + +class AsideR(EpubRenderer): + def render(self, element, ctx): + outputs = list(super(AsideR, self).render(element, ctx)) + anchor = ctx.footnotes.append(outputs) + wrapper, inside = self.text_container() #etree.Element('_', part_no=str(ctx.part_no)) + inside.append(anchor) + yield wrapper +EpubFormat.renderers.register(core.Aside, None, AsideR('div')) + + +class DivR(EpubRenderer): + def container(self, ctx): + root, inner = super(DivR, self).container(ctx) + if getattr(ctx, 'inline', False): + inner.tag = 'span' + inner.set('style', 'display: block;') + return root, inner +EpubFormat.renderers.register(core.Div, None, DivR('div')) + + +class HeaderR(EpubRenderer): + def subcontext(self, element, ctx): + return Context(ctx, inline=True) +EpubFormat.renderers.register(core.Header, None, HeaderR('h1')) + + +class SectionR(EpubRenderer): + epub_separate = True + + def render(self, element, ctx): + # Add 'poczatek'? 
+ if element.getparent() is not None: + tocitem = ctx.toc.add(element.meta.title(), 'part%d.html' % ctx.part_no) + ctx = Context(ctx, toc=tocitem) + return super(SectionR, self).render(element, ctx) +EpubFormat.renderers.register(core.Section, None, SectionR()) + + +class SpanR(EpubRenderer): + pass +EpubFormat.renderers.register(core.Span, None, SpanR('span')) + diff --git a/librarian/formats/epub/res/chapter.html b/librarian/formats/epub/res/chapter.html new file mode 100644 index 0000000..342d5df --- /dev/null +++ b/librarian/formats/epub/res/chapter.html @@ -0,0 +1,12 @@ + + + + + + WolneLektury.pl + + + +
+ + diff --git a/librarian/formats/epub/res/content.opf b/librarian/formats/epub/res/content.opf new file mode 100644 index 0000000..df95a3a --- /dev/null +++ b/librarian/formats/epub/res/content.opf @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/librarian/formats/epub/res/cover.html b/librarian/formats/epub/res/cover.html new file mode 100644 index 0000000..784067c --- /dev/null +++ b/librarian/formats/epub/res/cover.html @@ -0,0 +1,13 @@ + + + + + Okładka + + + +
+ Okładka +
+ + \ No newline at end of file diff --git a/librarian/formats/epub/res/footnotes.html b/librarian/formats/epub/res/footnotes.html new file mode 100644 index 0000000..b3b868c --- /dev/null +++ b/librarian/formats/epub/res/footnotes.html @@ -0,0 +1,17 @@ + + + + + + Przypisy + + + +
+

+ Przypisy: +

+
+
# diff --git a/librarian/formats/html/__init__.py b/librarian/formats/html/__init__.py new file mode 100644 index 0000000..ddf2c78 --- /dev/null +++ b/librarian/formats/html/__init__.py @@ -0,0 +1,167 @@
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
import re
from lxml import etree
from librarian.formats import Format
from librarian.output import OutputFile
from librarian.renderers import Register, TreeRenderer
from librarian.utils import Context, get_resource
from librarian import core


class HtmlFormat(Format):
    """Builds an HTML rendition of a document.

    With `standalone` set, a complete page template is used and its <title>
    is filled in; otherwise a fragment template is used.
    """
    format_name = 'HTML'
    format_ext = 'html'

    # Registry of element renderers, filled in at the bottom of this module.
    renderers = Register()

    def __init__(self, doc, standalone=False):
        super(HtmlFormat, self).__init__(doc)
        self.standalone = standalone

    def build(self):
        """Render the document into the template and return an OutputFile.

        The rendered text goes into div#content, the table of contents into
        div#toc and the collected footnotes into div#footnotes.
        """
        if self.standalone:
            tmpl = get_resource("formats/html/res/html_standalone.html")
        else:
            tmpl = get_resource("formats/html/res/html.html")
        t = etree.parse(tmpl)

        # Shared rendering state: renderers add TOC entries and footnotes
        # to these objects while the tree is walked.
        ctx = Context(format=self)
        ctx.toc = TOC()
        ctx.toc_level = 0
        ctx.footnotes = Footnotes()

        if self.standalone:
            t.find('head/title').text = u"%s (%s)" % (self.doc.meta.title(), self.doc.meta.author())

        t.find('.//div[@id="content"]').extend(
            self.render(self.doc.edoc.getroot(), ctx))
        t.find('.//div[@id="toc"]').append(ctx.toc.render())
        t.find('.//div[@id="footnotes"]').extend(ctx.footnotes.output)

        return OutputFile.from_string(etree.tostring(
            t, encoding='utf-8', method="html"))

    def render(self, element, ctx):
        """Render `element` with the renderer registered for it."""
        return self.renderers.get_for(element).render(element, ctx)


# Helpers

class NaturalText(TreeRenderer):
    """Renderer for running text: keeps one-letter words off line ends."""

    # A space that directly follows "<whitespace><single word character>".
    # Raw string: the original literal relied on the invalid escapes "\s"
    # and "\w" passing through a normal string unchanged.
    _after_short_word = re.compile(r'(?<=\s\w) ')

    def render_text(self, text, ctx):
        """Return `text` in a text container, with each space after a
        one-letter word replaced by a non-breaking space entity."""
        root, inner = self.text_container()
        chunks = self._after_short_word.split(text)
        inner.text = chunks[0]
        for chunk in chunks[1:]:
            x = etree.Entity("nbsp")
            x.tail = chunk
            inner.append(x)
        return root


class LiteralText(TreeRenderer):
    """Renderer for text that must be output as is (no nbsp substitution)."""
    pass


class Footnotes(object):
    """Collects footnote bodies and hands out numbered anchors for them."""

    def __init__(self):
        self.counter = 0
        # Neutral wrapper; only its children are copied into the page.
        self.output = etree.Element("_")

    def append(self, item):
        """Store a rendered footnote and return the in-text anchor for it.

        A numbered back-link (id "footnote-N") followed by the children of
        `item` is added to the collected output.  The returned <a> element
        (id "footnote-anchor-N") links to that footnote.
        """
        self.counter += 1
        e = etree.Element("a",
            href="#footnote-anchor-%d" % self.counter,
            id="footnote-%d" % self.counter,
            style="float:left;margin-right:1em")
        e.text = "[%d]" % self.counter
        e.tail = " "
        self.output.append(e)
        self.output.extend(item)
        anchor = etree.Element("a",
            id="footnote-anchor-%d" % self.counter,
            href="#footnote-%d" % self.counter)
        anchor.text = "[%d]" % self.counter
        return anchor


class TOC(object):
    """Flat list of (level, title, number) entries rendered as nested lists."""

    def __init__(self):
        self.items = []
        self.counter = 0

    def add(self, title, level=0):
        """Register an entry and return its number (numbering starts at 1)."""
        self.counter += 1
        self.items.append((level, title, self.counter))
        return self.counter

    def render(self):
        """Return a <ul> whose items link to the "#sectN" targets."""
        # NOTE(review): build() appends this list inside div#toc, so the id
        # "toc" ends up twice in the page - confirm whether that is intended.
        out = etree.Element("ul", id="toc")
        curr_level = 0
        cursor = out
        for level, title, counter in self.items:
            # Going deeper: open one nested list per level.
            while level > curr_level:
                ins = etree.Element("ul")
                cursor.append(ins)
                cursor = ins
                curr_level += 1
            # Going back up: climb to the enclosing list.
            while level < curr_level:
                cursor = cursor.getparent()
                curr_level -= 1
            ins = etree.Element("li")
            ins.append(etree.Element("a", href="#sect%d" % counter))
            ins[0].text = title
            cursor.append(ins)
        return out


# Renderers

HtmlFormat.renderers.register(core.Aside, None, NaturalText('aside'))

class AsideFootnote(NaturalText):
    """Moves the aside into the footnotes, leaving only an anchor in place."""

    def render(self, element, ctx):
        output = super(AsideFootnote, self).render(element, ctx)
        anchor = ctx.footnotes.append(output)
        root, inner = self.container()
        inner.append(anchor)
        return root
HtmlFormat.renderers.register(core.Aside, 'footnote', AsideFootnote())


HtmlFormat.renderers.register(core.Header, None, NaturalText('h1'))


HtmlFormat.renderers.register(core.Div, None, NaturalText('div'))
HtmlFormat.renderers.register(core.Div, 'item', NaturalText('li'))
HtmlFormat.renderers.register(core.Div, 'list', NaturalText('ul'))
HtmlFormat.renderers.register(core.Div, 'p', NaturalText('p'))


class Section(NaturalText):
    """Renders a section and registers it in the table of contents."""

    def subcontext(self, element, ctx):
        # Sections nested inside this one land one TOC level deeper.
        return Context(ctx, toc_level=ctx.toc_level + 1)

    def render(self, element, ctx):
        counter = ctx.toc.add(element.meta.title(), ctx.toc_level)
        root = super(Section, self).render(element, ctx)
        # Target of the "#sectN" link produced by TOC.render().
        root[0].set("id", "sect%d" % counter)
        return root
HtmlFormat.renderers.register(core.Section, None, Section('section'))


HtmlFormat.renderers.register(core.Span, None, NaturalText('span'))
HtmlFormat.renderers.register(core.Span, 'cite', NaturalText('cite'))
HtmlFormat.renderers.register(core.Span, 'cite.code', LiteralText('code'))
HtmlFormat.renderers.register(core.Span, 'emph', NaturalText('em'))

class SpanUri(LiteralText):
    """Renders a URI span as a link pointing at its own text."""

    def render(self, element, ctx):
        root = super(SpanUri, self).render(element, ctx)
        # NOTE(review): a span without text would pass None as the attribute
        # value here - confirm that uri spans always carry text.
        root[0].attrib['href'] = element.text
        return root
HtmlFormat.renderers.register(core.Span, 'uri', SpanUri('a'))

# diff --git a/librarian/formats/html/res/html.html b/librarian/formats/html/res/html.html new file mode 100644 index 0000000..a6e6314 --- /dev/null +++ b/librarian/formats/html/res/html.html @@ -0,0 +1,8 @@
+
+
+
+
+
+
+
diff --git a/librarian/formats/html/res/html_standalone.html b/librarian/formats/html/res/html_standalone.html new file mode 100644 index 0000000..a6b6213 --- /dev/null +++ b/librarian/formats/html/res/html_standalone.html @@ -0,0 +1,15 @@ + + + + + + + +
+
+
+
+
+
+ + diff --git a/librarian/functions.py b/librarian/functions.py deleted file mode 100644 index 523b3d5..0000000 --- a/librarian/functions.py +++ /dev/null @@ -1,106 +0,0 @@ -# -*- coding: utf-8 -*- -# -# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. -# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. -# -from lxml import etree -import re - -from librarian.dcparser import Person - -def _register_function(f): - """ Register extension function with lxml """ - ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') - ns[f.__name__] = f - - -def reg_substitute_entities(): - ENTITY_SUBSTITUTIONS = [ - (u'---', u'—'), - (u'--', u'–'), - (u'...', u'…'), - (u',,', u'„'), - (u'"', u'”'), - ] - - def substitute_entities(context, text): - """XPath extension function converting all entites in passed text.""" - if isinstance(text, list): - text = ''.join(text) - for entity, substitutution in ENTITY_SUBSTITUTIONS: - text = text.replace(entity, substitutution) - return text - - _register_function(substitute_entities) - - -def reg_strip(): - def strip(context, text): - """Remove unneeded whitespace from beginning and end""" - if isinstance(text, list): - text = ''.join(text) - return re.sub(r'\s+', ' ', text).strip() - _register_function(strip) - - -def reg_starts_white(): - def starts_white(context, text): - if isinstance(text, list): - text = ''.join(text) - if not text: - return False - return text[0].isspace() - _register_function(starts_white) - - -def reg_ends_white(): - def ends_white(context, text): - if isinstance(text, list): - text = ''.join(text) - if not text: - return False - return text[-1].isspace() - _register_function(ends_white) - - -def reg_wrap_words(): - def wrap_words(context, text, wrapping): - """XPath extension function automatically wrapping words in passed text""" - if isinstance(text, list): - text = ''.join(text) - if not wrapping: - return text - - words = re.split(r'\s', text) - - 
line_length = 0 - lines = [[]] - for word in words: - line_length += len(word) + 1 - if line_length > wrapping: - # Max line length was exceeded. We create new line - lines.append([]) - line_length = len(word) - lines[-1].append(word) - return '\n'.join(' '.join(line) for line in lines) - _register_function(wrap_words) - - -def reg_person_name(): - def person_name(context, text): - """ Converts "Name, Forename" to "Forename Name" """ - if isinstance(text, list): - text = ''.join(text) - return Person.from_text(text).readable() - _register_function(person_name) - - -def reg_texcommand(): - def texcommand(context, text): - """Remove non-letters""" - if isinstance(text, list): - text = ''.join(text) - return re.sub(r'[^a-zA-Z]', '', text).strip() - _register_function(texcommand) - - diff --git a/librarian/html.py b/librarian/html.py deleted file mode 100644 index c1a5e5b..0000000 --- a/librarian/html.py +++ /dev/null @@ -1,279 +0,0 @@ -# -*- coding: utf-8 -*- -# -# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. -# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. -# -import os -import cStringIO -import copy - -from lxml import etree -from librarian import XHTMLNS, ParseError, OutputFile -from librarian import functions - -from lxml.etree import XMLSyntaxError, XSLTApplyError - -functions.reg_substitute_entities() -functions.reg_person_name() - -STYLESHEETS = { - 'legacy': 'xslt/book2html.xslt', - 'full': 'xslt/wl2html_full.xslt', - 'partial': 'xslt/wl2html_partial.xslt' -} - -def get_stylesheet(name): - return os.path.join(os.path.dirname(__file__), STYLESHEETS[name]) - -def html_has_content(text): - return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text) - -def transform(wldoc, stylesheet='legacy', options=None, flags=None): - """Transforms the WL document to XHTML. - - If output_filename is None, returns an XML, - otherwise returns True if file has been written,False if it hasn't. 
- File won't be written if it has no content. - """ - # Parse XSLT - try: - style_filename = get_stylesheet(stylesheet) - style = etree.parse(style_filename) - - document = copy.deepcopy(wldoc) - del wldoc - document.swap_endlines() - - if flags: - for flag in flags: - document.edoc.getroot().set(flag, 'yes') - - document.clean_ed_note() - - if not options: - options = {} - result = document.transform(style, **options) - del document # no longer needed large object :) - - if html_has_content(result): - add_anchors(result.getroot()) - add_table_of_contents(result.getroot()) - - return OutputFile.from_string(etree.tostring(result, method='html', - xml_declaration=False, pretty_print=True, encoding='utf-8')) - else: - return None - except KeyError: - raise ValueError("'%s' is not a valid stylesheet.") - except (XMLSyntaxError, XSLTApplyError), e: - raise ParseError(e) - -class Fragment(object): - def __init__(self, id, themes): - super(Fragment, self).__init__() - self.id = id - self.themes = themes - self.events = [] - - def append(self, event, element): - self.events.append((event, element)) - - def closed_events(self): - stack = [] - for event, element in self.events: - if event == 'start': - stack.append(('end', element)) - elif event == 'end': - try: - stack.pop() - except IndexError: - print 'CLOSED NON-OPEN TAG:', element - - stack.reverse() - return self.events + stack - - def to_string(self): - result = [] - for event, element in self.closed_events(): - if event == 'start': - result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items()))) - if element.text: - result.append(element.text) - elif event == 'end': - result.append(u'' % element.tag) - if element.tail: - result.append(element.tail) - else: - result.append(element) - - return ''.join(result) - - def __unicode__(self): - return self.to_string() - - -def extract_fragments(input_filename): - """Extracts theme fragments from input_filename.""" - open_fragments = 
{} - closed_fragments = {} - - # iterparse would die on a HTML document - parser = etree.HTMLParser(encoding='utf-8') - buf = cStringIO.StringIO() - buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8')) - buf.seek(0) - - for event, element in etree.iterparse(buf, events=('start', 'end')): - # Process begin and end elements - if element.get('class', '') in ('theme-begin', 'theme-end'): - if not event == 'end': continue # Process elements only once, on end event - - # Open new fragment - if element.get('class', '') == 'theme-begin': - fragment = Fragment(id=element.get('fid'), themes=element.text) - - # Append parents - if element.getparent().get('id', None) != 'book-text': - parents = [element.getparent()] - while parents[-1].getparent().get('id', None) != 'book-text': - parents.append(parents[-1].getparent()) - - parents.reverse() - for parent in parents: - fragment.append('start', parent) - - open_fragments[fragment.id] = fragment - - # Close existing fragment - else: - try: - fragment = open_fragments[element.get('fid')] - except KeyError: - print '%s:closed not open fragment #%s' % (input_filename, element.get('fid')) - else: - closed_fragments[fragment.id] = fragment - del open_fragments[fragment.id] - - # Append element tail to lost_text (we don't want to lose any text) - if element.tail: - for fragment_id in open_fragments: - open_fragments[fragment_id].append('text', element.tail) - - - # Process all elements except begin and end - else: - # Omit annotation tags - if (len(element.get('name', '')) or - element.get('class', '') in ('annotation', 'anchor')): - if event == 'end' and element.tail: - for fragment_id in open_fragments: - open_fragments[fragment_id].append('text', element.tail) - else: - for fragment_id in open_fragments: - open_fragments[fragment_id].append(event, copy.copy(element)) - - return closed_fragments, open_fragments - - -def add_anchor(element, prefix, with_link=True, with_target=True, 
link_text=None): - if with_link: - if link_text is None: - link_text = prefix - anchor = etree.Element('a', href='#%s' % prefix) - anchor.set('class', 'anchor') - anchor.text = unicode(link_text) - if element.text: - anchor.tail = element.text - element.text = u'' - element.insert(0, anchor) - - if with_target: - anchor_target = etree.Element('a', name='%s' % prefix) - anchor_target.set('class', 'target') - anchor_target.text = u' ' - if element.text: - anchor_target.tail = element.text - element.text = u'' - element.insert(0, anchor_target) - - -def any_ancestor(element, test): - for ancestor in element.iterancestors(): - if test(ancestor): - return True - return False - - -def add_anchors(root): - counter = 1 - for element in root.iterdescendants(): - if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication') - or e.get('id') == 'nota_red' - or e.tag == 'blockquote'): - continue - - if element.tag == 'p' and 'verse' in element.get('class', ''): - if counter == 1 or counter % 5 == 0: - add_anchor(element, "f%d" % counter, link_text=counter) - counter += 1 - elif 'paragraph' in element.get('class', ''): - add_anchor(element, "f%d" % counter, link_text=counter) - counter += 1 - - -def raw_printable_text(element): - working = copy.deepcopy(element) - for e in working.findall('a'): - if e.get('class') == 'annotation': - e.text = '' - return etree.tostring(working, method='text', encoding=unicode).strip() - - -def add_table_of_contents(root): - sections = [] - counter = 1 - for element in root.iterdescendants(): - if element.tag in ('h2', 'h3'): - if any_ancestor(element, lambda e: e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)): - continue - - element_text = raw_printable_text(element) - if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2': - sections[-1][3].append((counter, element.tag, element_text, [])) - else: - sections.append((counter, element.tag, element_text, [])) - 
add_anchor(element, "s%d" % counter, with_link=False) - counter += 1 - - toc = etree.Element('div') - toc.set('id', 'toc') - toc_header = etree.SubElement(toc, 'h2') - toc_header.text = u'Spis treści' - toc_list = etree.SubElement(toc, 'ol') - - for n, section, text, subsections in sections: - section_element = etree.SubElement(toc_list, 'li') - add_anchor(section_element, "s%d" % n, with_target=False, link_text=text) - - if len(subsections): - subsection_list = etree.SubElement(section_element, 'ol') - for n, subsection, text, _ in subsections: - subsection_element = etree.SubElement(subsection_list, 'li') - add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text) - - root.insert(0, toc) - - -def extract_annotations(html_path): - """For each annotation, yields a tuple: anchor, text, html.""" - parser = etree.HTMLParser(encoding='utf-8') - tree = etree.parse(html_path, parser) - footnotes = tree.find('//*[@id="footnotes"]') - if footnotes is not None: - for footnote in footnotes.findall('div'): - anchor = footnote.find('a[@name]').get('name') - del footnote[:2] - text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip() - html_str = etree.tostring(footnote, method='html', encoding='utf-8') - yield anchor, text_str, html_str - diff --git a/librarian/meta.py b/librarian/meta.py new file mode 100755 index 0000000..5b50d92 --- /dev/null +++ b/librarian/meta.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +from lxml import etree +from librarian import DCNS, SSTNS + + +def text_value(meta): + """ Finds out the text value of metadata element. 
+ + >>> p = Person() + >>> p.text = u"Czajka, Radek" + >>> print text_value(p) + Radek Czajka + + """ + if hasattr(meta, 'text_value'): + return meta.text_value() + else: + return meta.text + + +class Metadata(etree.ElementBase): + @classmethod + def about(cls, element): + meta = cls() + meta._about = element + return meta + + def get_about(self): + if hasattr(self, '_about'): + return self._about + else: + return self.getparent() + + def get(self, key, inherit=True): + """ Finds metadata by its element name. """ + values = self.findall(key) + if values: + return [text_value(v) for v in values] + elif inherit and self.get_about().getparent() is not None: + return self.get_about().getparent().meta.get(key) + elif inherit and hasattr(self.get_about(), 'meta_context'): + return self.get_about().meta_context.get(key) + else: + return [] + + def get_one(self, *args, **kwargs): + values = self.get(*args, **kwargs) + if values: + return values[0] + else: + return None + + + # Specials. + + def author(self): + try: + return unicode(self.get(DCNS('creator'))[0]) + except IndexError: + return u"" + + def slug(self): + try: + return self.get(DCNS('identifier'))[0].slug() + except IndexError: + return None + + def title(self): + dc_title = self.get(DCNS('title'), inherit=False) + if dc_title: + return unicode(dc_title[0]) + else: + header = self.get_about().find(SSTNS('header')) + if header is not None: + # FIXME: This should be a simple text representation + return header.text + else: + return u"" + + +class MetaItem(etree.ElementBase): + pass + + +class Person(MetaItem): + def text_value(self): + return u" ".join(p.strip() for p in reversed(self.text.rsplit(u',', 1))) + + +class Identifier(MetaItem): + def slug(self): + return self.text.rstrip('/').rsplit('/', 1)[-1] diff --git a/librarian/output.py b/librarian/output.py new file mode 100755 index 0000000..a11f697 --- /dev/null +++ b/librarian/output.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- +# +# This file is part of 
Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +import os +import shutil + + +class OutputFile(object): + """Represents a file returned by one of the converters.""" + + _string = None + _filename = None + + def __del__(self): + if self._filename: + os.unlink(self._filename) + + def __nonzero__(self): + return self._string is not None or self._filename is not None + + @classmethod + def from_string(cls, string): + """Converter returns contents of a file as a string.""" + + instance = cls() + instance._string = string + return instance + + @classmethod + def from_filename(cls, filename): + """Converter returns contents of a file as a named file.""" + + instance = cls() + instance._filename = filename + return instance + + def get_string(self): + """Get file's contents as a string.""" + + if self._filename is not None: + with open(self._filename) as f: + return f.read() + else: + return self._string + + def get_file(self): + """Get file as a file-like object.""" + + if self._string is not None: + from StringIO import StringIO + return StringIO(self._string) + elif self._filename is not None: + return open(self._filename) + + def get_filename(self): + """Get file as a fs path.""" + + if self._filename is not None: + return self._filename + elif self._string is not None: + from tempfile import NamedTemporaryFile + temp = NamedTemporaryFile(prefix='librarian-', delete=False) + temp.write(self._string) + temp.close() + self._filename = temp.name + return self._filename + else: + return None + + def save_as(self, path): + """Save file to a path. 
Create directories, if necessary.""" + + dirname = os.path.dirname(os.path.abspath(path)) + if not os.path.isdir(dirname): + os.makedirs(dirname) + shutil.copy(self.get_filename(), path) diff --git a/librarian/parser.py b/librarian/parser.py index a9e8c65..a0b8a7f 100644 --- a/librarian/parser.py +++ b/librarian/parser.py @@ -3,226 +3,28 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from librarian import ValidationError, NoDublinCore, ParseError, NoProvider -from librarian import RDFNS -from librarian.cover import WLCover -from librarian import dcparser - -from xml.parsers.expat import ExpatError from lxml import etree -from lxml.etree import XMLSyntaxError, XSLTApplyError - -import os -import re -from StringIO import StringIO - -class WLDocument(object): - LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE) - provider = None - - def __init__(self, edoc, parse_dublincore=True, provider=None, - strict=False, meta_fallbacks=None): - self.edoc = edoc - self.provider = provider - - root_elem = edoc.getroot() - - dc_path = './/' + RDFNS('RDF') - - if root_elem.tag != 'utwor': - raise ValidationError("Invalid root element. 
Found '%s', should be 'utwor'" % root_elem.tag) - - if parse_dublincore: - self.rdf_elem = root_elem.find(dc_path) - - if self.rdf_elem is None: - raise NoDublinCore('Document has no DublinCore - which is required.') - - self.book_info = dcparser.BookInfo.from_element( - self.rdf_elem, fallbacks=meta_fallbacks, strict=strict) - else: - self.book_info = None - - @classmethod - def from_string(cls, xml, *args, **kwargs): - return cls.from_file(StringIO(xml), *args, **kwargs) - - @classmethod - def from_file(cls, xmlfile, *args, **kwargs): - - # first, prepare for parsing - if isinstance(xmlfile, basestring): - file = open(xmlfile, 'rb') - try: - data = file.read() - finally: - file.close() - else: - data = xmlfile.read() - - if not isinstance(data, unicode): - data = data.decode('utf-8') - - data = data.replace(u'\ufeff', '') - - try: - parser = etree.XMLParser(remove_blank_text=False) - tree = etree.parse(StringIO(data.encode('utf-8')), parser) - - return cls(tree, *args, **kwargs) - except (ExpatError, XMLSyntaxError, XSLTApplyError), e: - raise ParseError(e) - - def swap_endlines(self): - """Converts line breaks in stanzas into
tags.""" - # only swap inside stanzas - for elem in self.edoc.iter('strofa'): - for child in list(elem): - if child.tail: - chunks = self.LINE_SWAP_EXPR.split(child.tail) - ins_index = elem.index(child) + 1 - while len(chunks) > 1: - ins = etree.Element('br') - ins.tail = chunks.pop() - elem.insert(ins_index, ins) - child.tail = chunks.pop(0) - if elem.text: - chunks = self.LINE_SWAP_EXPR.split(elem.text) - while len(chunks) > 1: - ins = etree.Element('br') - ins.tail = chunks.pop() - elem.insert(0, ins) - elem.text = chunks.pop(0) - - def parts(self): - if self.provider is None: - raise NoProvider('No document provider supplied.') - if self.book_info is None: - raise NoDublinCore('No Dublin Core in document.') - for part_uri in self.book_info.parts: - yield self.from_file(self.provider.by_uri(part_uri), - provider=self.provider) - - def chunk(self, path): - # convert the path to XPath - expr = self.path_to_xpath(path) - elems = self.edoc.xpath(expr) - - if len(elems) == 0: - return None - else: - return elems[0] - - def path_to_xpath(self, path): - parts = [] - - for part in path.split('/'): - match = re.match(r'([^\[]+)\[(\d+)\]', part) - if not match: - parts.append(part) - else: - tag, n = match.groups() - parts.append("*[%d][name() = '%s']" % (int(n)+1, tag) ) - - if parts[0] == '.': - parts[0] = '' - - return '/'.join(parts) - - def transform(self, stylesheet, **options): - return self.edoc.xslt(stylesheet, **options) - - def update_dc(self): - if self.book_info: - parent = self.rdf_elem.getparent() - parent.replace( self.rdf_elem, self.book_info.to_etree(parent) ) - - def serialize(self): - self.update_dc() - return etree.tostring(self.edoc, encoding=unicode, pretty_print=True) - - def merge_chunks(self, chunk_dict): - unmerged = [] - - for key, data in chunk_dict.iteritems(): - try: - xpath = self.path_to_xpath(key) - node = self.edoc.xpath(xpath)[0] - repl = etree.fromstring(u"<%s>%s" %(node.tag, data, node.tag) ) - node.getparent().replace(node, repl) - 
except Exception, e: - unmerged.append( repr( (key, xpath, e) ) ) - - return unmerged - - def clean_ed_note(self): - """ deletes forbidden tags from nota_red """ - - for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in - ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))): - tail = node.tail - node.clear() - node.tag = 'span' - node.tail = tail - - def editors(self): - """Returns a set of all editors for book and its children. - - :returns: set of dcparser.Person objects - """ - if self.book_info is None: - raise NoDublinCore('No Dublin Core in document.') - persons = set(self.book_info.editors + - self.book_info.technical_editors) - for child in self.parts(): - persons.update(child.editors()) - if None in persons: - persons.remove(None) - return persons - - # Converters - - def as_html(self, *args, **kwargs): - from librarian import html - return html.transform(self, *args, **kwargs) - - def as_text(self, *args, **kwargs): - from librarian import text - return text.transform(self, *args, **kwargs) - - def as_epub(self, *args, **kwargs): - from librarian import epub - return epub.transform(self, *args, **kwargs) - - def as_pdf(self, *args, **kwargs): - from librarian import pdf - return pdf.transform(self, *args, **kwargs) - - def as_mobi(self, *args, **kwargs): - from librarian import mobi - return mobi.transform(self, *args, **kwargs) - - def as_fb2(self, *args, **kwargs): - from librarian import fb2 - return fb2.transform(self, *args, **kwargs) - - def as_cover(self, cover_class=None, *args, **kwargs): - if cover_class is None: - cover_class = WLCover - return cover_class(self.book_info, *args, **kwargs).output_file() - - def save_output_file(self, output_file, output_path=None, - output_dir_path=None, make_author_dir=False, ext=None): - if output_dir_path: - save_path = output_dir_path - if make_author_dir: - save_path = os.path.join(save_path, - unicode(self.book_info.author).encode('utf-8')) - save_path = os.path.join(save_path, - 
self.book_info.uri.slug) - if ext: - save_path += '.%s' % ext - else: - save_path = output_path - - output_file.save_as(save_path) +from . import DCNS, SSTNS +from . import core, meta + + +class SSTParser(etree.XMLParser): + """ XML parser using relevant element classes. """ + def __init__(self): + super(SSTParser, self).__init__(remove_blank_text=False) + lookup = etree.ElementNamespaceClassLookup() + self.set_element_class_lookup(lookup) + + # Define core language tags. + sst_ns = lookup.get_namespace(SSTNS.uri) + sst_ns['aside'] = core.Aside + sst_ns['div'] = core.Div + sst_ns['header'] = core.Header + sst_ns['section'] = core.Section + sst_ns['span'] = core.Span + sst_ns['metadata'] = meta.Metadata + + # Define any special metadata. + dc_ns = lookup.get_namespace(DCNS.uri) + dc_ns['creator'] = meta.Person + dc_ns['identifier'] = meta.Identifier diff --git a/librarian/picture.py b/librarian/picture.py deleted file mode 100644 index ee3c61d..0000000 --- a/librarian/picture.py +++ /dev/null @@ -1,173 +0,0 @@ - -from dcparser import (as_person, as_date, Field, WorkInfo, DCNS) -from librarian import (RDFNS, ValidationError, NoDublinCore, ParseError, WLURI) -from xml.parsers.expat import ExpatError -from os import path -from StringIO import StringIO -from lxml import etree -from lxml.etree import (XMLSyntaxError, XSLTApplyError) -import re - - -class WLPictureURI(WLURI): - _re_wl_uri = re.compile('http://wolnelektury.pl/katalog/obraz/' - '(?P[-a-z0-9]+)/?$') - - @classmethod - def from_slug(cls, slug): - uri = 'http://wolnelektury.pl/katalog/obraz/%s/' % slug - return cls(uri) - -def as_wlpictureuri_strict(text): - return WLPictureURI.strict(text) - - -class PictureInfo(WorkInfo): - """ - Dublin core metadata for a picture - """ - FIELDS = ( - Field(DCNS('language'), 'language', required=False), - Field(DCNS('subject.period'), 'epochs', salias='epoch', multiple=True), - Field(DCNS('subject.type'), 'kinds', salias='kind', multiple=True), - - 
Field(DCNS('format.dimensions'), 'dimensions', required=False), - Field(DCNS('format.checksum.sha1'), 'sha1', required=True), - Field(DCNS('description.medium'), 'medium', required=False), - Field(DCNS('description.dimensions'), 'original_dimensions', required=False), - Field(DCNS('format'), 'mime_type', required=False), - Field(DCNS('identifier.url'), 'url', WLPictureURI, - strict=as_wlpictureuri_strict), - ) - - -class ImageStore(object): - EXT = ['gif', 'jpeg', 'png', 'swf', 'psd', 'bmp' - 'tiff', 'tiff', 'jpc', 'jp2', 'jpf', 'jb2', 'swc', - 'aiff', 'wbmp', 'xbm'] - MIME = ['image/gif', 'image/jpeg', 'image/png', - 'application/x-shockwave-flash', 'image/psd', 'image/bmp', - 'image/tiff', 'image/tiff', 'application/octet-stream', - 'image/jp2', 'application/octet-stream', 'application/octet-stream', - 'application/x-shockwave-flash', 'image/iff', 'image/vnd.wap.wbmp', 'image/xbm'] - - def __init__(self, dir_): - self.dir = dir_ - return super(ImageStore, self).__init__() - - def path(self, slug, mime_type): - """ - Finds file by slug and mime type in our iamge store. - Returns a file objects (perhaps should return a filename?) - """ - try: - i = self.MIME.index(mime_type) - except ValueError: - err = ValueError("Picture %s has unknown mime type: %s" % (slug, mime_type)) - err.slug = slug - err.mime_type = mime_type - raise err - ext = self.EXT[i] - # add some common extensions tiff->tif, jpeg->jpg - return path.join(self.dir, slug + '.' + ext) - - -class WLPicture(object): - def __init__(self, edoc, parse_dublincore=True, image_store=None): - self.edoc = edoc - self.image_store = image_store - - root_elem = edoc.getroot() - - dc_path = './/' + RDFNS('RDF') - - if root_elem.tag != 'picture': - raise ValidationError("Invalid root element. 
Found '%s', should be 'picture'" % root_elem.tag) - - if parse_dublincore: - self.rdf_elem = root_elem.find(dc_path) - - if self.rdf_elem is None: - raise NoDublinCore('Document has no DublinCore - which is required.') - - self.picture_info = PictureInfo.from_element(self.rdf_elem) - else: - self.picture_info = None - - @classmethod - def from_string(cls, xml, *args, **kwargs): - return cls.from_file(StringIO(xml), *args, **kwargs) - - @classmethod - def from_file(cls, xmlfile, parse_dublincore=True, image_store=None): - - # first, prepare for parsing - if isinstance(xmlfile, basestring): - file = open(xmlfile, 'rb') - try: - data = file.read() - finally: - file.close() - else: - data = xmlfile.read() - - if not isinstance(data, unicode): - data = data.decode('utf-8') - - data = data.replace(u'\ufeff', '') - - # assume images are in the same directory - if image_store is None and xmlfile.name is not None: - image_store = ImageStore(path.dirname(xmlfile.name)) - - try: - parser = etree.XMLParser(remove_blank_text=False) - tree = etree.parse(StringIO(data.encode('utf-8')), parser) - - return cls(tree, parse_dublincore=parse_dublincore, image_store=image_store) - except (ExpatError, XMLSyntaxError, XSLTApplyError), e: - raise ParseError(e) - - @property - def mime_type(self): - if self.picture_info is None: - raise ValueError('DC is not loaded, hence we don\'t know the image type') - return self.picture_info.mime_type - - @property - def slug(self): - return self.picture_info.url.slug - - @property - def image_path(self): - if self.image_store is None: - raise ValueError("No image store associated with whis WLPicture.") - return self.image_store.path(self.slug, self.mime_type) - - def image_file(self, *args, **kwargs): - return open(self.image_path, *args, **kwargs) - - def partiter(self): - """ - Iterates the parts of this picture and returns them and their metadata - """ - for part in self.edoc.iter("div"): - pd = {} - pd['type'] = part.get('type') - if pd['type'] 
== 'area': - pd['coords'] = ((int(part.get('x1')), int(part.get('y1'))), - (int(part.get('x2')), int(part.get('y2')))) - - pd['themes'] = [] - pd['object'] = None - parent = part - while True: - parent = parent.getparent() - if parent is None: - break - if parent.tag == 'sem': - if parent.get('type') == 'theme': - pd['themes'] += map(unicode.strip, unicode(parent.get('theme')).split(',')) - elif parent.get('type') == 'object' and pd['object'] is None: - pd['object'] = parent.get('object') - yield pd diff --git a/librarian/renderers.py b/librarian/renderers.py new file mode 100755 index 0000000..59ed8a4 --- /dev/null +++ b/librarian/renderers.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +from lxml import etree +from . import UnicodeException +from .utils import extend_element + + +class UnknownElement(UnicodeException): + pass + + +class Renderer(object): + """ Renders an element in a context to some kind of container. """ + def render(self, element, ctx): + """ Renders the element in the context. """ + raise NotImplemented + + def render_text(self, text, ctx): + """ Renders the text in the context. """ + raise NotImplemented + + +class TreeRenderer(Renderer): + """ Renders insides as XML in a <_/> container. 
""" + root_name = "_" + + def __init__(self, tag_name=None, attrib=None): + self.tag_name = tag_name + self.attrib = attrib or {} + + def container(self): + root = etree.Element(self.root_name) + if self.tag_name: + inner = etree.Element(self.tag_name, **self.attrib) + root.append(inner) + return root, inner + else: + return root, root + + def text_container(self): + root = etree.Element(self.root_name) + return root, root + + def subcontext(self, element, ctx): + return ctx + + def get_insides(self, element, ctx): + subctx = self.subcontext(element, ctx) + if element.text: + yield self.render_text(element.text, ctx) + for child in element: + try: + yield ctx.format.render(child, subctx) + except UnknownElement: + pass + if child.tail: + yield self.render_text(child.tail, ctx) + + def render(self, element, ctx): + root, inner = self.container() + for inside in self.get_insides(element, ctx): + extend_element(inner, inside) + return root + + def render_text(self, text, ctx): + root, inner = self.text_container() + inner.text = text + return root + + + +class Register(object): + """ Class-renderer register. 
+ + >>> from librarian.core import Div + >>> renderer = Renderer() + >>> reg = Register() + >>> reg.register(Div, 'a.b', renderer) + >>> reg.get(Div, 'a.b.c') is renderer + True + + """ + def __init__(self): + self.classes = {} + + def register(self, tag, klass, renderer): + self.classes[tag, klass] = renderer + + def get(self, tag, klass=None): + while klass: + try: + return self.classes[tag, klass] + except KeyError: + try: + klass = klass.rsplit('.', 1)[-2] + except IndexError: + klass = None + try: + return self.classes[tag, None] + except KeyError: + raise UnknownElement(tag) + + def get_for(self, element): + return self.get(type(element), element.get('class')) diff --git a/librarian/utils.py b/librarian/utils.py new file mode 100755 index 0000000..25936bf --- /dev/null +++ b/librarian/utils.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +import os + + +class Context(object): + """ Processing context. 
+ + >>> ctx = Context(a=1) + >>> subctx = Context(ctx, a=2) + >>> ctx.b = 3 + >>> print subctx.a, subctx.b + 2 3 + + """ + def __init__(self, _upctx=None, **initial): + object.__setattr__(self, '_upctx', _upctx) + object.__setattr__(self, '_data', initial or {}) + + def __getattr__(self, name): + if name in self._data: + return self._data[name] + elif self._upctx is not None: + return getattr(self._upctx, name) + else: + raise AttributeError + + def __setattr__(self, name, value): + try: + self.try_setattr(name, value) + except ValueError: + self._data[name] = value + + def try_setattr(self, name, value): + if name in self._data: + self._data[name] = value + elif self._upctx is not None: + self._upctx.try_setattr(name, value) + else: + raise ValueError + + +class XMLNamespace(object): + '''A handy structure to repsent names in an XML namespace.''' + def __init__(self, uri): + self.uri = uri + + def __call__(self, tag): + return '{%s}%s' % (self.uri, tag) + + def __contains__(self, tag): + return tag.startswith('{' + str(self) + '}') + + def __repr__(self): + return 'XMLNamespace(%r)' % self.uri + + def __str__(self): + return '%s' % self.uri + + +def extend_element(container, element=None, text=None): + """ Extends XML element with another one's contents. + + Differs from etree.Element.extend by taking the text into account. 
+ + >>> from lxml import etree + >>> container = etree.fromstring("") + >>> element = etree.fromstring("<_>ac") + >>> extend_element(container, element) + >>> print etree.tostring(container) + ac + + """ + add_text = (text or "") + (element.text or "" if element is not None else "") + if add_text: + if len(container): + container[-1].tail = (container[-1].tail or "") + add_text + else: + container.text = (container.text or "") + add_text + if element is not None: + container.extend(element) + + +def get_resource(path): + return os.path.join(os.path.dirname(__file__), path) diff --git a/scripts/book2cover b/scripts/book2cover index 758ab0e..977096e 100755 --- a/scripts/book2cover +++ b/scripts/book2cover @@ -4,18 +4,14 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from StringIO import StringIO -from librarian import OutputFile from librarian.book2anything import Book2Anything, Option +from librarian.formats.cover.wolnelektury import WLCover class Book2Cover(Book2Anything): - format_name = "JPEG" - ext = "jpg" - uses_cover = True - cover_optional = False + format_cls = WLCover - transform_options = [ + format_options = [ Option('-W', '--width', action='store', type='int', dest='width', default=None, help='Set width.'), Option('-H', '--height', action='store', type='int', dest='height', default=None, @@ -25,10 +21,6 @@ class Book2Cover(Book2Anything): help='Add WL logo in white box.'), ] - @staticmethod - def transform(wldoc, cover, *args, **kwargs): - return wldoc.as_cover(cover_class=cover, *args, **kwargs) - if __name__ == '__main__': Book2Cover.run() diff --git a/scripts/book2epub b/scripts/book2epub index 01ca79a..4d061f0 100755 --- a/scripts/book2epub +++ b/scripts/book2epub @@ -5,19 +5,11 @@ # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# from librarian.book2anything import Book2Anything, Option +from librarian.formats.epub import EpubFormat class Book2Epub(Book2Anything): - format_name = "EPUB" - ext = "epub" - uses_cover = True - uses_provider = True - transform_flags = [ - Option('-w', '--working-copy', dest='working-copy', - action='store_true', default=False, - help='mark the output as a working copy') - ] - + format_cls = EpubFormat if __name__ == '__main__': Book2Epub.run() diff --git a/scripts/book2html b/scripts/book2html index 5d48eec..6c1e1c6 100755 --- a/scripts/book2html +++ b/scripts/book2html @@ -5,23 +5,17 @@ # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # from librarian.book2anything import Book2Anything, Option +from librarian.formats.html import HtmlFormat class Book2Html(Book2Anything): - format_name = "HTML" - ext = "html" - uses_cover = False - uses_provider = False - transform_flags = [ - Option('-r', '--raw', dest='full-page', + format_cls = HtmlFormat + + format_options = [ + Option('-r', '--raw', dest='standalone', action='store_false', default=True, help='output raw text for use in templates') ] - parser_args = [ - Option('-i', '--ignore-dublin-core', dest='parse_dublincore', - action='store_false', default=True, - help='don\'t try to parse dublin core metadata') - ] if __name__ == '__main__': diff --git a/setup.py b/setup.py index 51003ef..a0e4e53 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ # import os import os.path -from distutils.core import setup +from setuptools import setup, find_packages def whole_tree(prefix, path): files = [] @@ -21,18 +21,26 @@ def whole_tree(prefix, path): setup( name='librarian', - version='1.5.1', + version='2.0a', description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats', author="Marek Stępniowski", author_email='marek@stepniowski.com', maintainer='Radek Czajka', maintainer_email='radoslaw.czajka@nowoczesnapolska.org.pl', url='http://github.com/fnp/librarian', - 
packages=['librarian'], - package_data={'librarian': ['xslt/*.xslt', 'epub/*', 'mobi/*', 'pdf/*', 'fb2/*', 'fonts/*', 'res/*'] + - whole_tree(os.path.join(os.path.dirname(__file__), 'librarian'), 'font-optimizer')}, + packages=find_packages(), + package_data={ + 'librarian': ['xslt/*.xslt', 'epub/*', 'html/*', 'mobi/*', 'pdf/*', 'fb2/*', 'fonts/*', 'res/*'] + + whole_tree(os.path.join(os.path.dirname(__file__), 'librarian'), 'font-optimizer'), + 'librarian.formats.html': ['res/*'], + 'librarian.formats.epub': ['res/*'], + }, include_package_data=True, - install_requires=['lxml>=2.2'], + install_requires=[ + 'lxml>=2.2', + 'pillow', + 'Texml', + ], scripts=['scripts/book2html', 'scripts/book2txt', 'scripts/book2epub',