From: Marek Stępniowski Date: Fri, 19 Mar 2010 16:05:06 +0000 (+0100) Subject: Version 1.3.1dev. X-Git-Tag: 1.7~297 X-Git-Url: https://git.mdrn.pl/librarian.git/commitdiff_plain/b25c0f434cc50d48379fc8aa4c1bf5ea32212396 Version 1.3.1dev. --- diff --git a/MANIFEST.in b/MANIFEST.in index 38ee542..ff210ef 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ -include librarian/*.xslt -include librarian/config.xml +include ez_setup.py +include librarian/xslt/*.xslt +include librarian/xslt/config.xml diff --git a/librarian/__init__.py b/librarian/__init__.py old mode 100644 new mode 100755 index 5997a4e..d56440d --- a/librarian/__init__.py +++ b/librarian/__init__.py @@ -85,4 +85,25 @@ def wrap_text(ocrtext, creation_date, bookinfo=DEFAULT_BOOKINFO): method='xml', encoding=unicode, pretty_print=True) return u'\n' + dcstring + u'\n\n' + ocrtext +\ - u'\n\n'; \ No newline at end of file + u'\n\n'; + + +def serialize_raw(element): + b = u'' + (element.text or '') + + for child in element.iterchildren(): + e = etree.tostring(child, method='xml', encoding=unicode, pretty_print=True) + b += e + + return b + +from wl_light import serialize_nl + + +SERIALIZERS = { + 'raw': serialize_raw, + 'nl': serialize_nl, +} + +def serialize_children(element, format='raw'): + return SERIALIZERS[format](element) diff --git a/librarian/book2html.xslt b/librarian/book2html.xslt deleted file mode 100644 index 369b542..0000000 --- a/librarian/book2html.xslt +++ /dev/null @@ -1,614 +0,0 @@ - - - - - - - -
- - -
-

Przypisy

- -
- - [] - - -

-
- - - -
-
-
-
-
-
- -
- - - - - - - - -

- -

-
- -
- - - - - - - -
-
- - -
-

-
    - -
-
-
- - -
-
- - -
- -
-
- - -
-
- - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - -

-
- - -

-
- - -

-
- - - -

-
- - -
-
- - -
  • -
    - - -

    -
    - - -
    - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -

    - - - padding-left: 1em - - - - - padding-left: em - - - padding-left: 1em - - - - - padding-left: 12em - - - -

    -
    - - -

    -
    - - - - - - - - - - [] - - - - - - - - - - - - - - - - - - „” - - - - - - - - - - - - - - - - - -
    -
    - - -

    *

    -
    - - -
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    \ No newline at end of file diff --git a/librarian/book2txt.xslt b/librarian/book2txt.xslt deleted file mode 100644 index 72f193d..0000000 --- a/librarian/book2txt.xslt +++ /dev/null @@ -1,312 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -/ / - - - - - * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -„” - - - -** - - - - - - - - - - - - - - - - - - - - - - -* - - - - - - - ------------------------------------------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/librarian/config.xml b/librarian/config.xml deleted file mode 100644 index e1f4b6f..0000000 --- a/librarian/config.xml +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - - - -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    \ No newline at end of file diff --git a/librarian/html.py b/librarian/html.py index 6551995..ad1b8bd 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -19,9 +19,9 @@ ENTITY_SUBSTITUTIONS = [ ] STYLESHEETS = { - 'legacy': 'book2html.xslt', - 'full': 'wl2html_full.xslt', - 'partial': 'wl2html_partial.xslt' + 'legacy': 'xslt/book2html.xslt', + 'full': 'xslt/wl2html_full.xslt', + 'partial': 'xslt/wl2html_partial.xslt' } def get_stylesheet(name): @@ -67,7 +67,6 @@ def transform(input, output_filename=None, is_file=True, \ return result return True else: - print "[Librarian] didn't find any paragraphs" return "" except KeyError: raise ValueError("'%s' is not a valid stylesheet.") diff --git a/librarian/parser.py b/librarian/parser.py old mode 100644 new mode 100755 index 55b4e4b..ae4ffa0 --- a/librarian/parser.py +++ b/librarian/parser.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- from librarian import ValidationError, NoDublinCore, ParseError -from librarian import RDFNS, DCNS +from librarian import RDFNS from librarian import dcparser from xml.parsers.expat import ExpatError @@ -57,23 +57,20 @@ class WLDocument(object): data = cls.LINE_SWAP_EXPR.sub(u'
    \n', data) try: - parser = etree.XMLParser(remove_blank_text=True) + parser = etree.XMLParser(remove_blank_text=False) return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore) except (ExpatError, XMLSyntaxError, XSLTApplyError), e: raise ParseError(e) - def part_as_text(self, path): + def chunk(self, path): # convert the path to XPath - print "[L] Retrieving part:", path + expr = self.path_to_xpath(path) + elems = self.edoc.xpath(expr) - elems = self.edoc.xpath(self.path_to_xpath(path)) - print "[L] xpath", elems - if len(elems) == 0: - return None - - return etree.tostring(elems[0], encoding=unicode, pretty_print=True) - + return None + else: + return elems[0] def path_to_xpath(self, path): parts = [] @@ -84,7 +81,7 @@ class WLDocument(object): parts.append(part) else: tag, n = match.groups() - parts.append("node()[position() = %d and name() = '%s']" % (int(n), tag) ) + parts.append("*[%d][name() = '%s']" % (int(n)+1, tag) ) if parts[0] == '.': parts[0] = '' @@ -95,8 +92,9 @@ class WLDocument(object): return self.edoc.xslt(stylesheet, **options) def update_dc(self): - parent = self.rdf_elem.getparent() - parent.replace( self.rdf_elem, self.book_info.to_etree(parent) ) + if self.book_info: + parent = self.rdf_elem.getparent() + parent.replace( self.rdf_elem, self.book_info.to_etree(parent) ) def serialize(self): self.update_dc() @@ -108,8 +106,8 @@ class WLDocument(object): for key, data in chunk_dict.iteritems(): try: xpath = self.path_to_xpath(key) - node = self.edoc.xpath(xpath)[0] - repl = etree.fromstring(data) + node = self.edoc.xpath(xpath)[0] + repl = etree.fromstring(u"<%s>%s" %(node.tag, data, node.tag) ) node.getparent().replace(node, repl); except Exception, e: unmerged.append( repr( (key, xpath, e) ) ) diff --git a/librarian/text.py b/librarian/text.py index 972dd61..931a152 100644 --- a/librarian/text.py +++ b/librarian/text.py @@ -79,7 +79,7 @@ ns['wrap_words'] = wrap_words def transform(input_filename, output_filename, is_file=True, parse_dublincore=True, **options): """Transforms file input_filename in XML to output_filename in TXT.""" # Parse XSLT - style_filename = os.path.join(os.path.dirname(__file__), 'book2txt.xslt') + style_filename = os.path.join(os.path.dirname(__file__), 'xslt/book2txt.xslt') style = etree.parse(style_filename) if is_file: diff --git a/librarian/wl2html_base.xslt b/librarian/wl2html_base.xslt deleted file mode 100644 index cd31ef1..0000000 --- a/librarian/wl2html_base.xslt +++ /dev/null @@ -1,376 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - editable - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Nieznany tag '' :(. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      - - - - - - -
    -
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    diff --git a/librarian/wl2html_full.xslt b/librarian/wl2html_full.xslt deleted file mode 100644 index deaf0c5..0000000 --- a/librarian/wl2html_full.xslt +++ /dev/null @@ -1,27 +0,0 @@ - - - - - - - - - - -
    - - - - - - - - - - -
    -
    - -
    \ No newline at end of file diff --git a/librarian/wl2html_partial.xslt b/librarian/wl2html_partial.xslt deleted file mode 100644 index 0fdca74..0000000 --- a/librarian/wl2html_partial.xslt +++ /dev/null @@ -1,20 +0,0 @@ - - - - - - - - - - - Processing... - - - - - - - \ No newline at end of file diff --git a/librarian/wl_light.py b/librarian/wl_light.py new file mode 100644 index 0000000..a39e8e0 --- /dev/null +++ b/librarian/wl_light.py @@ -0,0 +1,47 @@ +# -*- encoding: utf-8 -*- + +__author__= "Łukasz Rekucki" +__date__ = "$2009-10-19 16:31:14$" +__doc__ = "Functions to operate on a tag-light version of WLML." + +class LightSerializer(object): + + def __init__(self): + pass + + def serialize(self, element): + handler = getattr(self, 'serialize_' + element.tag, self.identity) + return handler(element) + (element.tail or u'') + + def serialize_slowo_obce(self, e): + return u' %%'+self.descent(e)+u'%% ' + + def descent(self, e): + b = (e.text or u'') + for child in e.iterchildren(): + b += self.serialize(child) + return b + + def identity(self, e): + b = u'<'+e.tag + + # attributes + b += u' '.join((u'%s="%s"' % (attr, value) for attr,value in e.items())) + b += u'>' + b += self.descent(e) + b += u'' + + return b + +_serializer = LightSerializer() + +def serialize_nl(element): + prolog = u'' + element.text # ordinary stuff + data = u'' + + for child in element.iterchildren(): + data += _serializer.serialize(child) + + return prolog + data + + diff --git a/librarian/xslt/book2html.xslt b/librarian/xslt/book2html.xslt new file mode 100644 index 0000000..369b542 --- /dev/null +++ b/librarian/xslt/book2html.xslt @@ -0,0 +1,614 @@ + + + + + + + +
    + + +
    +

    Przypisy

    + +
    + + [] + + +

    +
    + + + +
    +
    +
    +
    +
    +
    + +
    + + + + + + + + +

    + +

    +
    + +
    + + + + + + + +
    +
    + + +
    +

    +
      + +
    +
    +
    + + +
    +
    + + +
    + +
    +
    + + +
    +
    + + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + +

    +
    + + +

    +
    + + +

    +
    + + + +

    +
    + + +
    +
    + + +
  • +
    + + +

    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +

    + + + padding-left: 1em + + + + + padding-left: em + + + padding-left: 1em + + + + + padding-left: 12em + + + +

    +
    + + +

    +
    + + + + + + + + + + [] + + + + + + + + + + + + + + + + + + „” + + + + + + + + + + + + + + + + + +
    +
    + + +

    *

    +
    + + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    \ No newline at end of file diff --git a/librarian/xslt/book2txt.xslt b/librarian/xslt/book2txt.xslt new file mode 100644 index 0000000..72f193d --- /dev/null +++ b/librarian/xslt/book2txt.xslt @@ -0,0 +1,312 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +/ / + + + + + * + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +„” + + + +** + + + + + + + + + + + + + + + + + + + + + + +* + + + + + + + +------------------------------------------------ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/librarian/xslt/config.xml b/librarian/xslt/config.xml new file mode 100644 index 0000000..68f4ea9 --- /dev/null +++ b/librarian/xslt/config.xml @@ -0,0 +1,129 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    diff --git a/librarian/xslt/normalize.xslt b/librarian/xslt/normalize.xslt new file mode 100755 index 0000000..f5e0314 --- /dev/null +++ b/librarian/xslt/normalize.xslt @@ -0,0 +1,393 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + p + + + + + + + + + + + + + + + + + + dot + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ` + + + + + + ` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Nieprzetworzony węzeł: + + + + + + + + \ No newline at end of file diff --git a/librarian/xslt/wl2fo.xslt b/librarian/xslt/wl2fo.xslt new file mode 100755 index 0000000..fddb2c4 --- /dev/null +++ b/librarian/xslt/wl2fo.xslt @@ -0,0 +1,338 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + —  + + + + + + + + + + + + + + + + + + - + + + always + + + + always + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/librarian/xslt/wl2html_base.xslt b/librarian/xslt/wl2html_base.xslt new file mode 100755 index 0000000..725abac --- /dev/null +++ b/librarian/xslt/wl2html_base.xslt @@ -0,0 +1,357 @@ + + + + + + + + + + + + + + + + + + + + + + editable + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + + + + + + + +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + * + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    Uwaga! Tekst poza paragrafem

    +

    ##

    +
    +
    +
    +
    + + + + + + + + + +
    + +
    + + + +
    diff --git a/librarian/xslt/wl2html_full.xslt b/librarian/xslt/wl2html_full.xslt new file mode 100755 index 0000000..02c89c7 --- /dev/null +++ b/librarian/xslt/wl2html_full.xslt @@ -0,0 +1,26 @@ + + + + + + + + + +
    + + + + + + + + + + +
    +
    + +
    \ No newline at end of file diff --git a/librarian/xslt/wl2html_partial.xslt b/librarian/xslt/wl2html_partial.xslt new file mode 100755 index 0000000..ffd65f2 --- /dev/null +++ b/librarian/xslt/wl2html_partial.xslt @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/scripts/normalize.py b/scripts/normalize.py new file mode 100755 index 0000000..b927269 --- /dev/null +++ b/scripts/normalize.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python + +from __future__ import with_statement + +import re +import sys +import os.path + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from StringIO import StringIO +from lxml import etree +import librarian + +REPLACEMENTS = ( + (u'---', u'\u2014'), # mdash + (u'--', u'\u2013'), # ndash + (u'...', u'\u2026'), # ldots + (u',,', u'\u201E'), # lower double back-quote + (u'"', u'\u201D'), # upper double quote +) + +DIALOG_EXPR = re.compile(r"\s*---\s(.*)") + +def wl_normalize_text(context, text): + """XPath extension function converting all entites in passed text.""" + if isinstance(text, list): + text = u''.join(text) + + for code, ucode in REPLACEMENTS: + text = text.replace(code, ucode) + + return text + +def wl_fix_dialog(context, data): + + if isinstance(data, list): + text = u''.join(data) + else: + text = data + + m = DIALOG_EXPR.match(text) + + if m is not None: + return m.group(1) + else: + return text + + +def filter_verse_ends(data): + return data.replace('/\n', '
    ') + +ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') +ns['normalize-text'] = wl_normalize_text +ns['fix-dialog-line'] = wl_fix_dialog + +def normalize_stylesheet(): + return etree.XSLT(etree.parse(os.path.join(os.path.dirname(librarian.__file__), 'xslt', 'normalize.xslt'))) + +if __name__ == '__main__': + tran = normalize_stylesheet() + input = StringIO( f ) + doc = trans( etree.parse(input) ) + print etree.tostring(doc, pretty_print=True, encoding=unicode).encode('utf-8') + + for err in trans.error_log: + sys.stderr.write( (u"%s\n" % err).encode('utf-8') ) + diff --git a/setup.cfg b/setup.cfg index f2f658c..abee305 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,6 @@ +[egg_info] +tag_build = .dev +tag_date = 1 + [aliases] test = nosetests --detailed-errors --with-doctest --with-coverage --cover-package=librarian diff --git a/setup.py b/setup.py index 34d016e..bad0a3f 100755 --- a/setup.py +++ b/setup.py @@ -8,9 +8,9 @@ from setuptools import setup, find_packages setup( name='librarian', - version='1.2.5', + version='1.3', description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats', - author='Marek Stępniowski', + author='Marek Stepniowski', author_email='marek@stepniowski.com', url='http://redmine.nowoczesnapolska.org.pl/', packages=find_packages(exclude=['tests']),