From b25c0f434cc50d48379fc8aa4c1bf5ea32212396 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20St=C4=99pniowski?= Date: Fri, 19 Mar 2010 17:05:06 +0100 Subject: [PATCH] Version 1.3.1dev. --- MANIFEST.in | 5 +- librarian/__init__.py | 23 +- librarian/html.py | 7 +- librarian/parser.py | 30 +- librarian/text.py | 2 +- librarian/wl_light.py | 47 +++ librarian/{ => xslt}/book2html.xslt | 0 librarian/{ => xslt}/book2txt.xslt | 0 librarian/{ => xslt}/config.xml | 22 +- librarian/xslt/normalize.xslt | 393 ++++++++++++++++++++++ librarian/xslt/wl2fo.xslt | 338 +++++++++++++++++++ librarian/{ => xslt}/wl2html_base.xslt | 161 ++++----- librarian/{ => xslt}/wl2html_full.xslt | 13 +- librarian/{ => xslt}/wl2html_partial.xslt | 18 +- scripts/normalize.py | 68 ++++ setup.cfg | 4 + setup.py | 4 +- 17 files changed, 997 insertions(+), 138 deletions(-) mode change 100644 => 100755 librarian/__init__.py mode change 100644 => 100755 librarian/parser.py create mode 100644 librarian/wl_light.py rename librarian/{ => xslt}/book2html.xslt (100%) rename librarian/{ => xslt}/book2txt.xslt (100%) rename librarian/{ => xslt}/config.xml (84%) create mode 100755 librarian/xslt/normalize.xslt create mode 100755 librarian/xslt/wl2fo.xslt rename librarian/{ => xslt}/wl2html_base.xslt (72%) mode change 100644 => 100755 rename librarian/{ => xslt}/wl2html_full.xslt (56%) mode change 100644 => 100755 rename librarian/{ => xslt}/wl2html_partial.xslt (56%) mode change 100644 => 100755 create mode 100755 scripts/normalize.py diff --git a/MANIFEST.in b/MANIFEST.in index 38ee542..ff210ef 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ -include librarian/*.xslt -include librarian/config.xml +include ez_setup.py +include librarian/xslt/*.xslt +include librarian/xslt/config.xml diff --git a/librarian/__init__.py b/librarian/__init__.py old mode 100644 new mode 100755 index 5997a4e..d56440d --- a/librarian/__init__.py +++ b/librarian/__init__.py @@ -85,4 +85,25 @@ def wrap_text(ocrtext, creation_date, bookinfo=DEFAULT_BOOKINFO): method='xml', encoding=unicode, pretty_print=True) return u'\n' + dcstring + u'\n\n' + ocrtext +\ - u'\n\n'; \ No newline at end of file + u'\n\n'; + + +def serialize_raw(element): + b = u'' + (element.text or '') + + for child in element.iterchildren(): + e = etree.tostring(child, method='xml', encoding=unicode, pretty_print=True) + b += e + + return b + +from wl_light import serialize_nl + + +SERIALIZERS = { + 'raw': serialize_raw, + 'nl': serialize_nl, +} + +def serialize_children(element, format='raw'): + return SERIALIZERS[format](element) diff --git a/librarian/html.py b/librarian/html.py index 6551995..ad1b8bd 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -19,9 +19,9 @@ ENTITY_SUBSTITUTIONS = [ ] STYLESHEETS = { - 'legacy': 'book2html.xslt', - 'full': 'wl2html_full.xslt', - 'partial': 'wl2html_partial.xslt' + 'legacy': 'xslt/book2html.xslt', + 'full': 'xslt/wl2html_full.xslt', + 'partial': 'xslt/wl2html_partial.xslt' } def get_stylesheet(name): @@ -67,7 +67,6 @@ def transform(input, output_filename=None, is_file=True, \ return result return True else: - print "[Librarian] didn't find any paragraphs" return "" except KeyError: raise ValueError("'%s' is not a valid stylesheet.") diff --git a/librarian/parser.py b/librarian/parser.py old mode 100644 new mode 100755 index 55b4e4b..ae4ffa0 --- a/librarian/parser.py +++ b/librarian/parser.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- from librarian import ValidationError, NoDublinCore, ParseError -from librarian import RDFNS, DCNS +from librarian import RDFNS from librarian import dcparser from xml.parsers.expat import ExpatError @@ -57,23 +57,20 @@ class WLDocument(object): data = cls.LINE_SWAP_EXPR.sub(u'
\n', data) try: - parser = etree.XMLParser(remove_blank_text=True) + parser = etree.XMLParser(remove_blank_text=False) return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore) except (ExpatError, XMLSyntaxError, XSLTApplyError), e: raise ParseError(e) - def part_as_text(self, path): + def chunk(self, path): # convert the path to XPath - print "[L] Retrieving part:", path + expr = self.path_to_xpath(path) + elems = self.edoc.xpath(expr) - elems = self.edoc.xpath(self.path_to_xpath(path)) - print "[L] xpath", elems - if len(elems) == 0: - return None - - return etree.tostring(elems[0], encoding=unicode, pretty_print=True) - + return None + else: + return elems[0] def path_to_xpath(self, path): parts = [] @@ -84,7 +81,7 @@ class WLDocument(object): parts.append(part) else: tag, n = match.groups() - parts.append("node()[position() = %d and name() = '%s']" % (int(n), tag) ) + parts.append("*[%d][name() = '%s']" % (int(n)+1, tag) ) if parts[0] == '.': parts[0] = '' @@ -95,8 +92,9 @@ class WLDocument(object): return self.edoc.xslt(stylesheet, **options) def update_dc(self): - parent = self.rdf_elem.getparent() - parent.replace( self.rdf_elem, self.book_info.to_etree(parent) ) + if self.book_info: + parent = self.rdf_elem.getparent() + parent.replace( self.rdf_elem, self.book_info.to_etree(parent) ) def serialize(self): self.update_dc() @@ -108,8 +106,8 @@ class WLDocument(object): for key, data in chunk_dict.iteritems(): try: xpath = self.path_to_xpath(key) - node = self.edoc.xpath(xpath)[0] - repl = etree.fromstring(data) + node = self.edoc.xpath(xpath)[0] + repl = etree.fromstring(u"<%s>%s" %(node.tag, data, node.tag) ) node.getparent().replace(node, repl); except Exception, e: unmerged.append( repr( (key, xpath, e) ) ) diff --git a/librarian/text.py b/librarian/text.py index 972dd61..931a152 100644 --- a/librarian/text.py +++ b/librarian/text.py @@ -79,7 +79,7 @@ ns['wrap_words'] = wrap_words def transform(input_filename, output_filename, is_file=True, parse_dublincore=True, **options): """Transforms file input_filename in XML to output_filename in TXT.""" # Parse XSLT - style_filename = os.path.join(os.path.dirname(__file__), 'book2txt.xslt') + style_filename = os.path.join(os.path.dirname(__file__), 'xslt/book2txt.xslt') style = etree.parse(style_filename) if is_file: diff --git a/librarian/wl_light.py b/librarian/wl_light.py new file mode 100644 index 0000000..a39e8e0 --- /dev/null +++ b/librarian/wl_light.py @@ -0,0 +1,47 @@ +# -*- encoding: utf-8 -*- + +__author__= "Łukasz Rekucki" +__date__ = "$2009-10-19 16:31:14$" +__doc__ = "Functions to operate on a tag-light version of WLML." + +class LightSerializer(object): + + def __init__(self): + pass + + def serialize(self, element): + handler = getattr(self, 'serialize_' + element.tag, self.identity) + return handler(element) + (element.tail or u'') + + def serialize_slowo_obce(self, e): + return u' %%'+self.descent(e)+u'%% ' + + def descent(self, e): + b = (e.text or u'') + for child in e.iterchildren(): + b += self.serialize(child) + return b + + def identity(self, e): + b = u'<'+e.tag + + # attributes + b += u' '.join((u'%s="%s"' % (attr, value) for attr,value in e.items())) + b += u'>' + b += self.descent(e) + b += u'' + + return b + +_serializer = LightSerializer() + +def serialize_nl(element): + prolog = u'' + element.text # ordinary stuff + data = u'' + + for child in element.iterchildren(): + data += _serializer.serialize(child) + + return prolog + data + + diff --git a/librarian/book2html.xslt b/librarian/xslt/book2html.xslt similarity index 100% rename from librarian/book2html.xslt rename to librarian/xslt/book2html.xslt diff --git a/librarian/book2txt.xslt b/librarian/xslt/book2txt.xslt similarity index 100% rename from librarian/book2txt.xslt rename to librarian/xslt/book2txt.xslt diff --git a/librarian/config.xml b/librarian/xslt/config.xml similarity index 84% rename from librarian/config.xml rename to librarian/xslt/config.xml index e1f4b6f..68f4ea9 100644 --- a/librarian/config.xml +++ b/librarian/xslt/config.xml @@ -1,5 +1,6 @@ - + + @@ -9,16 +10,18 @@ + + - + - + @@ -28,7 +31,7 @@ - + @@ -40,12 +43,12 @@ - + - + @@ -53,13 +56,13 @@ - + - + @@ -94,6 +97,7 @@ + @@ -122,4 +126,4 @@ - \ No newline at end of file + diff --git a/librarian/xslt/normalize.xslt b/librarian/xslt/normalize.xslt new file mode 100755 index 0000000..f5e0314 --- /dev/null +++ b/librarian/xslt/normalize.xslt @@ -0,0 +1,393 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + p + + + + + + + + + + + + + + + + + + dot + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ` + + + + + + ` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Nieprzetworzony węzeł: + + + + + + + + \ No newline at end of file diff --git a/librarian/xslt/wl2fo.xslt b/librarian/xslt/wl2fo.xslt new file mode 100755 index 0000000..fddb2c4 --- /dev/null +++ b/librarian/xslt/wl2fo.xslt @@ -0,0 +1,338 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + —  + + + + + + + + + + + + + + + + + + - + + + always + + + + always + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/librarian/wl2html_base.xslt b/librarian/xslt/wl2html_base.xslt old mode 100644 new mode 100755 similarity index 72% rename from librarian/wl2html_base.xslt rename to librarian/xslt/wl2html_base.xslt index cd31ef1..725abac --- a/librarian/wl2html_base.xslt +++ b/librarian/xslt/wl2html_base.xslt @@ -4,7 +4,6 @@ xmlns="http://www.w3.org/1999/xhtml" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" - xmlns:wl2o="http://nowoczesnapolska.org.pl/WL/2.0/Overlay" xmlns:wl="http://wolnelektury.pl/functions" exclude-result-prefixes="wl" > @@ -16,7 +15,6 @@ indent="yes" omit-xml-declaration = "yes" /> - - - - + + + + - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - + + - - - - - - Nieznany tag '' :(. - - + + - + Tagi niestandardowe + --> + + + + @@ -176,6 +123,7 @@ + @@ -196,10 +144,11 @@
    - - - - + + + + +
@@ -219,6 +168,7 @@ + @@ -233,7 +183,7 @@ - + @@ -276,7 +226,8 @@ - + + @@ -287,7 +238,7 @@ - + @@ -333,8 +284,9 @@ - + + @@ -344,15 +296,19 @@ + + + + @@ -361,14 +317,39 @@ + - + + + + + + + + + + + +
+

Uwaga! Tekst poza paragrafem

+

##

+
+
+
+
- + + + + + + + + +
-
diff --git a/librarian/wl2html_full.xslt b/librarian/xslt/wl2html_full.xslt old mode 100644 new mode 100755 similarity index 56% rename from librarian/wl2html_full.xslt rename to librarian/xslt/wl2html_full.xslt index deaf0c5..02c89c7 --- a/librarian/wl2html_full.xslt +++ b/librarian/xslt/wl2html_full.xslt @@ -1,9 +1,8 @@ - - + @@ -14,11 +13,11 @@ - - - - - + + + + + diff --git a/librarian/wl2html_partial.xslt b/librarian/xslt/wl2html_partial.xslt old mode 100644 new mode 100755 similarity index 56% rename from librarian/wl2html_partial.xslt rename to librarian/xslt/wl2html_partial.xslt index 0fdca74..ffd65f2 --- a/librarian/wl2html_partial.xslt +++ b/librarian/xslt/wl2html_partial.xslt @@ -4,17 +4,23 @@ - + - + + - Processing... - + + + - - + + + \ No newline at end of file diff --git a/scripts/normalize.py b/scripts/normalize.py new file mode 100755 index 0000000..b927269 --- /dev/null +++ b/scripts/normalize.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python + +from __future__ import with_statement + +import re +import sys +import os.path + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from StringIO import StringIO +from lxml import etree +import librarian + +REPLACEMENTS = ( + (u'---', u'\u2014'), # mdash + (u'--', u'\u2013'), # ndash + (u'...', u'\u2026'), # ldots + (u',,', u'\u201E'), # lower double back-quote + (u'"', u'\u201D'), # upper double quote +) + +DIALOG_EXPR = re.compile(r"\s*---\s(.*)") + +def wl_normalize_text(context, text): + """XPath extension function converting all entites in passed text.""" + if isinstance(text, list): + text = u''.join(text) + + for code, ucode in REPLACEMENTS: + text = text.replace(code, ucode) + + return text + +def wl_fix_dialog(context, data): + + if isinstance(data, list): + text = u''.join(data) + else: + text = data + + m = DIALOG_EXPR.match(text) + + if m is not None: + return m.group(1) + else: + return text + + +def filter_verse_ends(data): + return data.replace('/\n', '
') + +ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') +ns['normalize-text'] = wl_normalize_text +ns['fix-dialog-line'] = wl_fix_dialog + +def normalize_stylesheet(): + return etree.XSLT(etree.parse(os.path.join(os.path.dirname(librarian.__file__), 'xslt', 'normalize.xslt'))) + +if __name__ == '__main__': + tran = normalize_stylesheet() + input = StringIO( f ) + doc = trans( etree.parse(input) ) + print etree.tostring(doc, pretty_print=True, encoding=unicode).encode('utf-8') + + for err in trans.error_log: + sys.stderr.write( (u"%s\n" % err).encode('utf-8') ) + diff --git a/setup.cfg b/setup.cfg index f2f658c..abee305 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,6 @@ +[egg_info] +tag_build = .dev +tag_date = 1 + [aliases] test = nosetests --detailed-errors --with-doctest --with-coverage --cover-package=librarian diff --git a/setup.py b/setup.py index 34d016e..bad0a3f 100755 --- a/setup.py +++ b/setup.py @@ -8,9 +8,9 @@ from setuptools import setup, find_packages setup( name='librarian', - version='1.2.5', + version='1.3', description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats', - author='Marek Stępniowski', + author='Marek Stepniowski', author_email='marek@stepniowski.com', url='http://redmine.nowoczesnapolska.org.pl/', packages=find_packages(exclude=['tests']), -- 2.20.1