From f8e5f031c04122d65d1066077be3920ae95518ae Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20St=C4=99pniowski?= Date: Thu, 3 Sep 2009 16:39:08 +0200 Subject: [PATCH 1/1] Stable version 1.2.5. --- .gitignore | 4 +- MANIFEST.in | 4 +- ez_setup.py | 275 +++++++++++++ librarian/__init__.py | 79 +++- librarian/book2html.xslt | 11 +- librarian/config.xml | 125 ++++++ librarian/dcparser.py | 102 +++-- librarian/html.py | 60 ++- librarian/parser.py | 85 +++- librarian/text.py | 23 +- librarian/wl2html_base.xslt | 376 ++++++++++++++++++ librarian/wl2html_full.xslt | 27 ++ librarian/wl2html_partial.xslt | 20 + scripts/book2html | 6 +- scripts/book2ihtml | 54 +++ scripts/book2txt | 7 +- setup.cfg | 2 + setup.py | 20 +- tests/files/text/asnyk_miedzy_nami.xml | 87 ++-- .../text/asnyk_miedzy_nami_expected.html | 47 +++ .../files/text/asnyk_miedzy_nami_expected.txt | 38 ++ ...zy_nami.txt => asnyk_miedzy_nami_nodc.txt} | 0 tests/files/text/asnyk_miedzy_nami_nodc.xml | 25 ++ tests/test_dcparser.py | 74 ++-- tests/test_html.py | 40 ++ tests/test_text.py | 51 ++- tests/utils.py | 67 +--- 27 files changed, 1445 insertions(+), 264 deletions(-) create mode 100644 ez_setup.py create mode 100644 librarian/config.xml create mode 100644 librarian/wl2html_base.xslt create mode 100644 librarian/wl2html_full.xslt create mode 100644 librarian/wl2html_partial.xslt create mode 100755 scripts/book2ihtml create mode 100644 setup.cfg mode change 100644 => 100755 tests/files/text/asnyk_miedzy_nami.xml create mode 100644 tests/files/text/asnyk_miedzy_nami_expected.html create mode 100644 tests/files/text/asnyk_miedzy_nami_expected.txt rename tests/files/text/{asnyk_miedzy_nami.txt => asnyk_miedzy_nami_nodc.txt} (100%) create mode 100644 tests/files/text/asnyk_miedzy_nami_nodc.xml mode change 100755 => 100644 tests/test_dcparser.py create mode 100644 tests/test_html.py mode change 100755 => 100644 tests/test_text.py diff --git a/.gitignore b/.gitignore index bfdc1af..f72688b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,5 @@ MANIFEST dist build -nbproject -nbproject/* +*.egg-info +.coverage diff --git a/MANIFEST.in b/MANIFEST.in index 9b7ec3d..38ee542 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ -include librarian/*.xslt -recursive-include tests/files/ *.xml +include librarian/*.xslt +include librarian/config.xml diff --git a/ez_setup.py b/ez_setup.py new file mode 100644 index 0000000..4a84fea --- /dev/null +++ b/ez_setup.py @@ -0,0 +1,275 @@ +#!python +"""Bootstrap setuptools installation + +If you want to use setuptools in your package's setup.py, just include this +file in the same directory with it, and add this to the top of your setup.py:: + + from ez_setup import use_setuptools + use_setuptools() + +If you want to require a specific version of setuptools, set a download +mirror, or use an alternate download directory, you can do so by supplying +the appropriate options to ``use_setuptools()``. + +This file can also be run as a script to install or upgrade setuptools. +""" +import sys +DEFAULT_VERSION = "0.6c9" +DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3] + +md5_data = { + 'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca', + 'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb', + 'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b', + 'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a', + 'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618', + 'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac', + 'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5', + 'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4', + 'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c', + 'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b', + 'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27', + 'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277', + 'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa', + 'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e', + 'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e', + 'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f', + 'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2', + 'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc', + 'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167', + 'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64', + 'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d', + 'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20', + 'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab', + 'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53', + 'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2', + 'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e', + 'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372', + 'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902', + 'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de', + 'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b', + 'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03', + 'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a', + 'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6', + 'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a', +} + +import sys, os +try: from hashlib import md5 +except ImportError: from md5 import md5 + +def _validate_md5(egg_name, data): + if egg_name in md5_data: + digest = md5(data).hexdigest() + if digest != md5_data[egg_name]: + print >>sys.stderr, ( + "md5 validation of %s failed! (Possible download problem?)" + % egg_name + ) + sys.exit(2) + return data + +def use_setuptools( + version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, + download_delay=15 +): + """Automatically find/download setuptools and make it available on sys.path + + `version` should be a valid setuptools version number that is available + as an egg for download under the `download_base` URL (which should end with + a '/'). `to_dir` is the directory where setuptools will be downloaded, if + it is not already available. If `download_delay` is specified, it should + be the number of seconds that will be paused before initiating a download, + should one be required. If an older version of setuptools is installed, + this routine will print a message to ``sys.stderr`` and raise SystemExit in + an attempt to abort the calling script. + """ + was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules + def do_download(): + egg = download_setuptools(version, download_base, to_dir, download_delay) + sys.path.insert(0, egg) + import setuptools; setuptools.bootstrap_install_from = egg + try: + import pkg_resources + except ImportError: + return do_download() + try: + pkg_resources.require("setuptools>="+version); return + except pkg_resources.VersionConflict, e: + if was_imported: + print >>sys.stderr, ( + "The required version of setuptools (>=%s) is not available, and\n" + "can't be installed while this script is running. Please install\n" + " a more recent version first, using 'easy_install -U setuptools'." + "\n\n(Currently using %r)" + ) % (version, e.args[0]) + sys.exit(2) + else: + del pkg_resources, sys.modules['pkg_resources'] # reload ok + return do_download() + except pkg_resources.DistributionNotFound: + return do_download() + +def download_setuptools( + version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, + delay = 15 +): + """Download setuptools from a specified location and return its filename + + `version` should be a valid setuptools version number that is available + as an egg for download under the `download_base` URL (which should end + with a '/'). `to_dir` is the directory where the egg will be downloaded. + `delay` is the number of seconds to pause before an actual download attempt. + """ + import urllib2, shutil + egg_name = "setuptools-%s-py%s.egg" % (version,sys.version[:3]) + url = download_base + egg_name + saveto = os.path.join(to_dir, egg_name) + src = dst = None + if not os.path.exists(saveto): # Avoid repeated downloads + try: + from distutils import log + if delay: + log.warn(""" +--------------------------------------------------------------------------- +This script requires setuptools version %s to run (even to display +help). I will attempt to download it for you (from +%s), but +you may need to enable firewall access for this script first. +I will start the download in %d seconds. + +(Note: if this machine does not have network access, please obtain the file + + %s + +and place it in this directory before rerunning this script.) +---------------------------------------------------------------------------""", + version, download_base, delay, url + ); from time import sleep; sleep(delay) + log.warn("Downloading %s", url) + src = urllib2.urlopen(url) + # Read/write all in one block, so we don't create a corrupt file + # if the download is interrupted. + data = _validate_md5(egg_name, src.read()) + dst = open(saveto,"wb"); dst.write(data) + finally: + if src: src.close() + if dst: dst.close() + return os.path.realpath(saveto) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +def main(argv, version=DEFAULT_VERSION): + """Install or upgrade setuptools and EasyInstall""" + try: + import setuptools + except ImportError: + egg = None + try: + egg = download_setuptools(version, delay=0) + sys.path.insert(0,egg) + from setuptools.command.easy_install import main + return main(list(argv)+[egg]) # we're done here + finally: + if egg and os.path.exists(egg): + os.unlink(egg) + else: + if setuptools.__version__ == '0.0.1': + print >>sys.stderr, ( + "You have an obsolete version of setuptools installed. Please\n" + "remove it from your system entirely before rerunning this script." + ) + sys.exit(2) + + req = "setuptools>="+version + import pkg_resources + try: + pkg_resources.require(req) + except pkg_resources.VersionConflict: + try: + from setuptools.command.easy_install import main + except ImportError: + from easy_install import main + main(list(argv)+[download_setuptools(delay=0)]) + sys.exit(0) # try to force an exit + else: + if argv: + from setuptools.command.easy_install import main + main(argv) + else: + print "Setuptools version",version,"or greater has been installed." + print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)' + +def update_md5(filenames): + """Update our built-in md5 registry""" + + import re + + for name in filenames: + base = os.path.basename(name) + f = open(name,'rb') + md5_data[base] = md5(f.read()).hexdigest() + f.close() + + data = [" %r: %r,\n" % it for it in md5_data.items()] + data.sort() + repl = "".join(data) + + import inspect + srcfile = inspect.getsourcefile(sys.modules[__name__]) + f = open(srcfile, 'rb'); src = f.read(); f.close() + + match = re.search("\nmd5_data = {\n([^}]+)}", src) + if not match: + print >>sys.stderr, "Internal error!" + sys.exit(2) + + src = src[:match.start(1)] + repl + src[match.end(1):] + f = open(srcfile,'w') + f.write(src) + f.close() + + +if __name__=='__main__': + if len(sys.argv)>2 and sys.argv[1]=='--md5update': + update_md5(sys.argv[2:]) + else: + main(sys.argv[1:]) + + + + + diff --git a/librarian/__init__.py b/librarian/__init__.py index 9132f5c..5997a4e 100644 --- a/librarian/__init__.py +++ b/librarian/__init__.py @@ -2,10 +2,87 @@ # exception classes class ParseError(Exception): - pass + + def __init__(self, cause, message=None): + self.cause = cause + try: + self.message = message or self.cause.message + except: + self.message = "No message." class ValidationError(Exception): pass class NoDublinCore(ValidationError): pass + +class XMLNamespace(object): + '''A handy structure to repsent names in an XML namespace.''' + + def __init__(self, uri): + self.uri = uri + + def __call__(self, tag): + return '{%s}%s' % (self.uri, tag) + + def __contains__(self, tag): + return tag.startswith('{'+str(self)+'}') + + def __repr__(self): + return 'XMLNamespace(%r)' % self.uri + + def __str__(self): + return '%s' % self.uri + +class EmptyNamespace(XMLNamespace): + def __init__(self): + super(EmptyNamespace, self).__init__('') + + def __call__(self, tag): + return tag + +# some common namespaces we use +RDFNS = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#') +DCNS = XMLNamespace('http://purl.org/dc/elements/1.1/') +XINS = XMLNamespace("http://www.w3.org/2001/XInclude") +XHTMLNS = XMLNamespace("http://www.w3.org/1999/xhtml") + +WLNS = EmptyNamespace() + +import lxml.etree as etree +import dcparser + +DEFAULT_BOOKINFO = dcparser.BookInfo( + { RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'},\ + { DCNS('creator'): [u'Some, Author'], + DCNS('title'): [u'Some Title'], + DCNS('subject.period'): [u'Unknown'], + DCNS('subject.type'): [u'Unknown'], + DCNS('subject.genre'): [u'Unknown'], + DCNS('date'): ['1970-01-01'], + # DCNS('date'): [creation_date], + DCNS('publisher'): [u"Fundacja Nowoczesna Polska"], + DCNS('description'): + [u"""Publikacja zrealizowana w ramach projektu + Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa + wykonana przez Bibliotekę Narodową z egzemplarza + pochodzącego ze zbiorów BN."""], + DCNS('identifier.url'): + [u"http://wolnelektury.pl/katalog/lektura/template"], + DCNS('rights'): + [u"Domena publiczna - zm. [OPIS STANU PRAWNEGO TEKSTU]"] }) + +def xinclude_forURI(uri): + e = etree.Element( XINS("include") ) + e.set("href", uri) + return etree.tostring(e, encoding=unicode) + +def wrap_text(ocrtext, creation_date, bookinfo=DEFAULT_BOOKINFO): + """Wrap the text within the minimal XML structure with a DC template.""" + bookinfo.created_at = creation_date + + dcstring = etree.tostring(bookinfo.to_etree(),\ + method='xml', encoding=unicode, pretty_print=True) + + return u'\n' + dcstring + u'\n\n' + ocrtext +\ + u'\n\n'; \ No newline at end of file diff --git a/librarian/book2html.xslt b/librarian/book2html.xslt index 71f1182..369b542 100644 --- a/librarian/book2html.xslt +++ b/librarian/book2html.xslt @@ -414,11 +414,11 @@ - + - + @@ -456,11 +456,11 @@ -

+

-
+
@@ -611,5 +611,4 @@ - - + \ No newline at end of file diff --git a/librarian/config.xml b/librarian/config.xml new file mode 100644 index 0000000..e1f4b6f --- /dev/null +++ b/librarian/config.xml @@ -0,0 +1,125 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + +
\ No newline at end of file diff --git a/librarian/dcparser.py b/librarian/dcparser.py index 830b089..80d6247 100644 --- a/librarian/dcparser.py +++ b/librarian/dcparser.py @@ -3,7 +3,7 @@ from xml.parsers.expat import ExpatError from datetime import date import time -from librarian import ValidationError, NoDublinCore +from librarian import ValidationError, NoDublinCore, ParseError, DCNS, RDFNS import lxml.etree as etree # ElementTree API using libxml2 from lxml.etree import XMLSyntaxError @@ -105,54 +105,31 @@ class Field(object): return self.validate_value(f) -# ========== -# = Parser = -# ========== -class XMLNamespace(object): - '''Represents XML namespace.''' - - def __init__(self, uri): - self.uri = uri - - def __call__(self, tag): - return '{%s}%s' % (self.uri, tag) - def __contains__(self, tag): - return tag.startswith(str(self)) - def __repr__(self): - return 'XMLNamespace(%r)' % self.uri - - def __str__(self): - return '%s' % self.uri - - -class BookInfo(object): - RDF = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#') - DC = XMLNamespace('http://purl.org/dc/elements/1.1/') - +class BookInfo(object): FIELDS = ( - Field( DC('creator'), 'author', as_person), - Field( DC('title'), 'title'), - Field( DC('subject.period'), 'epoches', salias='epoch', multiple=True), - Field( DC('subject.type'), 'kinds', salias='kind', multiple=True), - Field( DC('subject.genre'), 'genres', salias='genre', multiple=True), - Field( DC('date'), 'created_at', as_date), - Field( DC('date.pd'), 'released_to_public_domain_at', as_date, required=False), - Field( DC('contributor.editor'), 'editors', \ + Field( DCNS('creator'), 'author', as_person), + Field( DCNS('title'), 'title'), + Field( DCNS('subject.period'), 'epochs', salias='epoch', multiple=True), + Field( DCNS('subject.type'), 'kinds', salias='kind', multiple=True), + Field( DCNS('subject.genre'), 'genres', salias='genre', multiple=True), + Field( DCNS('date'), 'created_at', as_date), + Field( DCNS('date.pd'), 'released_to_public_domain_at', as_date, required=False), + Field( DCNS('contributor.editor'), 'editors', \ as_person, salias='editor', multiple=True, default=[]), - Field( DC('contributor.translator'), 'translators', \ + Field( DCNS('contributor.translator'), 'translators', \ as_person, salias='translator', multiple=True, default=[]), - Field( DC('contributor.technical_editor'), 'technical_editors', + Field( DCNS('contributor.technical_editor'), 'technical_editors', as_person, salias='technical_editor', multiple=True, default=[]), - Field( DC('publisher'), 'publisher'), - Field( DC('source'), 'source_name', required=False), - Field( DC('source.URL'), 'source_url', required=False), - Field( DC('identifier.url'), 'url'), - Field( DC('relation.hasPart'), 'parts', multiple=True, required=False), - Field( DC('rights.license'), 'license', required=False), - Field( DC('rights'), 'license_description'), + Field( DCNS('publisher'), 'publisher'), + Field( DCNS('source'), 'source_name', required=False), + Field( DCNS('source.URL'), 'source_url', required=False), + Field( DCNS('identifier.url'), 'url'), + Field( DCNS('relation.hasPart'), 'parts', multiple=True, required=False), + Field( DCNS('rights.license'), 'license', required=False), + Field( DCNS('rights'), 'license_description'), ) @classmethod @@ -166,7 +143,7 @@ class BookInfo(object): try: iter = etree.iterparse(xmlfile, ['start', 'end']) for (event, element) in iter: - if element.tag == cls.RDF('RDF') and event == 'start': + if element.tag == RDFNS('RDF') and event == 'start': desc_tag = element break @@ -176,7 +153,7 @@ class BookInfo(object): # continue 'till the end of RDF section for (event, element) in iter: - if element.tag == cls.RDF('RDF') and event == 'end': + if element.tag == RDFNS('RDF') and event == 'end': break # if there is no end, Expat should yell at us with an ExpatError @@ -192,7 +169,7 @@ class BookInfo(object): def from_element(cls, rdf_tag): # the tree is already parsed, so we don't need to worry about Expat errors field_dict = {} - desc = rdf_tag.find(".//" + cls.RDF('Description') ) + desc = rdf_tag.find(".//" + RDFNS('Description') ) if desc is None: raise NoDublinCore("No DublinCore section found.") @@ -202,14 +179,14 @@ class BookInfo(object): fv.append(e.text) field_dict[e.tag] = fv - return cls( desc.attrib, field_dict ) + return cls( desc.attrib, field_dict ) def __init__(self, rdf_attrs, dc_fields): """rdf_attrs should be a dictionary-like object with any attributes of the RDF:Description. dc_fields - dictionary mapping DC fields (with namespace) to list of text values for the given field. """ - self.about = rdf_attrs.get(self.RDF('about')) + self.about = rdf_attrs.get(RDFNS('about')) self.fmap = {} for field in self.FIELDS: @@ -258,14 +235,14 @@ class BookInfo(object): #etree._namespace_map[str(self.DC)] = 'dc' if parent is None: - root = etree.Element(self.RDF('RDF')) + root = etree.Element(RDFNS('RDF')) else: - root = parent.makeelement(self.RDF('RDF')) + root = parent.makeelement(RDFNS('RDF')) - description = etree.SubElement(root, self.RDF('Description')) + description = etree.SubElement(root, RDFNS('Description')) if self.about: - description.set(self.RDF('about'), self.about) + description.set(RDFNS('about'), self.about) for field in self.FIELDS: v = getattr(self, field.name, None) @@ -283,6 +260,25 @@ class BookInfo(object): return root + + def serialize(self): + rdf = {} + rdf['about'] = { 'uri': RDFNS('about'), 'value': self.about } + + dc = {} + for field in self.FIELDS: + v = getattr(self, field.name, None) + if v is not None: + if field.multiple: + if len(v) == 0: continue + v = [ unicode(x) for x in v if v is not None ] + else: + v = unicode(v) + + dc[field.name] = {'uri': field.uri, 'value': v} + rdf['fields'] = dc + return rdf + def to_dict(self): result = {'about': self.about} for field in self.FIELDS: @@ -291,14 +287,14 @@ class BookInfo(object): if v is not None: if field.multiple: if len(v) == 0: continue - v = [ unicode(x) for x in v ] + v = [ unicode(x) for x in v if v is not None ] else: v = unicode(v) result[field.name] = v if field.salias: v = getattr(self, field.salias) - if v is not None: result[field.salias] = v + if v is not None: result[field.salias] = unicode(v) return result diff --git a/librarian/html.py b/librarian/html.py index 4edbf33..6551995 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -6,7 +6,9 @@ import copy from lxml import etree from librarian.parser import WLDocument +from librarian import XHTMLNS, ParseError +from lxml.etree import XMLSyntaxError, XSLTApplyError ENTITY_SUBSTITUTIONS = [ (u'---', u'—'), @@ -16,6 +18,14 @@ ENTITY_SUBSTITUTIONS = [ (u'"', u'”'), ] +STYLESHEETS = { + 'legacy': 'book2html.xslt', + 'full': 'wl2html_full.xslt', + 'partial': 'wl2html_partial.xslt' +} + +def get_stylesheet(name): + return os.path.join(os.path.dirname(__file__), STYLESHEETS[name]) def substitute_entities(context, text): """XPath extension function converting all entites in passed text.""" @@ -25,38 +35,44 @@ def substitute_entities(context, text): text = text.replace(entity, substitutution) return text - # Register substitute_entities function with lxml ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') ns['substitute_entities'] = substitute_entities - -def transform(input, output_filename=None, is_file=True): +def transform(input, output_filename=None, is_file=True, \ + parse_dublincore=True, stylesheet='legacy', options={}): """Transforms file input_filename in XML to output_filename in XHTML.""" # Parse XSLT - style_filename = os.path.join(os.path.dirname(__file__), 'book2html.xslt') - style = etree.parse(style_filename) + try: + style_filename = get_stylesheet(stylesheet) + style = etree.parse(style_filename) - if is_file: - document = WLDocument.from_file(input, True) - else: - document = WLDocument.from_string(input, True) - - result = document.transform(style) - del document # no longer needed large object :) + if is_file: + document = WLDocument.from_file(input, True, \ + parse_dublincore=parse_dublincore) + else: + document = WLDocument.from_string(input, True, \ + parse_dublincore=parse_dublincore) - if result.find('//p') is not None: - add_anchors(result.getroot()) - add_table_of_contents(result.getroot()) + result = document.transform(style, **options) + del document # no longer needed large object :) - if output_filename is not None: - result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8') + if etree.ETXPath('//p|//{%s}p' % str(XHTMLNS))(result) is not None: + add_anchors(result.getroot()) + add_table_of_contents(result.getroot()) + + if output_filename is not None: + result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8') + else: + return result + return True else: - return result - return True - else: - return False - + print "[Librarian] didn't find any paragraphs" + return "" + except KeyError: + raise ValueError("'%s' is not a valid stylesheet.") + except (XMLSyntaxError, XSLTApplyError), e: + raise ParseError(e) class Fragment(object): def __init__(self, id, themes): diff --git a/librarian/parser.py b/librarian/parser.py index 595dd97..55b4e4b 100644 --- a/librarian/parser.py +++ b/librarian/parser.py @@ -1,8 +1,11 @@ # -*- coding: utf-8 -*- -from librarian import ValidationError, NoDublinCore, dcparser, ParseError +from librarian import ValidationError, NoDublinCore, ParseError +from librarian import RDFNS, DCNS +from librarian import dcparser + from xml.parsers.expat import ExpatError from lxml import etree -from lxml.etree import XMLSyntaxError +from lxml.etree import XMLSyntaxError, XSLTApplyError import re from StringIO import StringIO @@ -10,29 +13,32 @@ from StringIO import StringIO class WLDocument(object): LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE); - def __init__(self, edoc): + def __init__(self, edoc, parse_dublincore=True): self.edoc = edoc root_elem = edoc.getroot() - rdf_ns = dcparser.BookInfo.RDF - dc_path = './/' + rdf_ns('RDF') + + dc_path = './/' + RDFNS('RDF') if root_elem.tag != 'utwor': raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag) - self.rdf_elem = root_elem.find(dc_path) - - if self.rdf_elem is None: - raise NoDublinCore('Document has no DublinCore - which is required.') - - self.book_info = dcparser.BookInfo.from_element(self.rdf_elem) + if parse_dublincore: + self.rdf_elem = root_elem.find(dc_path) + if self.rdf_elem is None: + raise NoDublinCore('Document has no DublinCore - which is required.') + + self.book_info = dcparser.BookInfo.from_element(self.rdf_elem) + else: + self.book_info = None + @classmethod - def from_string(cls, xml, swap_endlines=False): - return cls.from_file(StringIO(xml), swap_endlines) + def from_string(cls, xml, swap_endlines=False, parse_dublincore=True): + return cls.from_file(StringIO(xml), swap_endlines, parse_dublincore=parse_dublincore) @classmethod - def from_file(cls, xmlfile, swap_endlines=False): + def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True): # first, prepare for parsing if isinstance(xmlfile, basestring): @@ -52,11 +58,38 @@ class WLDocument(object): try: parser = etree.XMLParser(remove_blank_text=True) - return cls( etree.parse(StringIO(data), parser) ) - except XMLSyntaxError, e: - raise ParseError(e.message) - except ExpatError, e: - raise ParseError(e.message) + return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore) + except (ExpatError, XMLSyntaxError, XSLTApplyError), e: + raise ParseError(e) + + def part_as_text(self, path): + # convert the path to XPath + print "[L] Retrieving part:", path + + elems = self.edoc.xpath(self.path_to_xpath(path)) + print "[L] xpath", elems + + if len(elems) == 0: + return None + + return etree.tostring(elems[0], encoding=unicode, pretty_print=True) + + + def path_to_xpath(self, path): + parts = [] + + for part in path.split('/'): + match = re.match(r'([^\[]+)\[(\d+)\]', part) + if not match: + parts.append(part) + else: + tag, n = match.groups() + parts.append("node()[position() = %d and name() = '%s']" % (int(n), tag) ) + + if parts[0] == '.': + parts[0] = '' + + return '/'.join(parts) def transform(self, stylesheet, **options): return self.edoc.xslt(stylesheet, **options) @@ -68,3 +101,17 @@ class WLDocument(object): def serialize(self): self.update_dc() return etree.tostring(self.edoc, encoding=unicode, pretty_print=True) + + def merge_chunks(self, chunk_dict): + unmerged = [] + + for key, data in chunk_dict.iteritems(): + try: + xpath = self.path_to_xpath(key) + node = self.edoc.xpath(xpath)[0] + repl = etree.fromstring(data) + node.getparent().replace(node, repl); + except Exception, e: + unmerged.append( repr( (key, xpath, e) ) ) + + return unmerged \ No newline at end of file diff --git a/librarian/text.py b/librarian/text.py index 0754a99..972dd61 100644 --- a/librarian/text.py +++ b/librarian/text.py @@ -1,12 +1,10 @@ # -*- coding: utf-8 -*- -import os +from librarian import dcparser, parser +from lxml import etree import cStringIO -import re import codecs - -from lxml import etree - -from librarian import dcparser +import os +import re ENTITY_SUBSTITUTIONS = [ @@ -78,22 +76,27 @@ ns['substitute_entities'] = substitute_entities ns['wrap_words'] = wrap_words -def transform(input_filename, output_filename, **options): +def transform(input_filename, output_filename, is_file=True, parse_dublincore=True, **options): """Transforms file input_filename in XML to output_filename in TXT.""" # Parse XSLT style_filename = os.path.join(os.path.dirname(__file__), 'book2txt.xslt') style = etree.parse(style_filename) if is_file: - document = WLDocument.from_file(input, True) + document = parser.WLDocument.from_file(input_filename, True, parse_dublincore=parse_dublincore) else: - document = WLDocument.from_string(input, True) + document = parser.WLDocument.from_string(input_filename, True, parse_dublincore=parse_dublincore) result = document.transform(style, **options) output_file = codecs.open(output_filename, 'wb', encoding='utf-8') + + if parse_dublincore: + url = dcparser.parse(input_filename).url + else: + url = '*' * 10 output_file.write(TEMPLATE % { - 'url': dcparser.parse(input_filename).url, + 'url': url, 'text': unicode(result), }) diff --git a/librarian/wl2html_base.xslt b/librarian/wl2html_base.xslt new file mode 100644 index 0000000..cd31ef1 --- /dev/null +++ b/librarian/wl2html_base.xslt @@ -0,0 +1,376 @@ + + + + + + + + + + + + + + + + + + + + + + + editable + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Nieznany tag '' :(. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + * + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/librarian/wl2html_full.xslt b/librarian/wl2html_full.xslt new file mode 100644 index 0000000..deaf0c5 --- /dev/null +++ b/librarian/wl2html_full.xslt @@ -0,0 +1,27 @@ + + + + + + + + + + +
+ + + + + + + + + + +
+
+ +
\ No newline at end of file diff --git a/librarian/wl2html_partial.xslt b/librarian/wl2html_partial.xslt new file mode 100644 index 0000000..0fdca74 --- /dev/null +++ b/librarian/wl2html_partial.xslt @@ -0,0 +1,20 @@ + + + + + + + + + + + Processing... + + + + + + + \ No newline at end of file diff --git a/scripts/book2html b/scripts/book2html index 02f2fa7..5594223 100755 --- a/scripts/book2html +++ b/scripts/book2html @@ -14,7 +14,9 @@ if __name__ == '__main__': parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, help='print status messages to stdout') - + parser.add_option('-i', '--ignore-dublin-core', action='store_false', dest='parse_dublincore', default=True, + help='don\'t try to parse dublin core metadata') + options, input_filenames = parser.parse_args() if len(input_filenames) < 1: @@ -28,7 +30,7 @@ if __name__ == '__main__': output_filename = os.path.splitext(input_filename)[0] + '.html' try: - html.transform(input_filename, output_filename) + html.transform(input_filename, output_filename, parse_dublincore=options.parse_dublincore) except ParseError, e: print '%(file)s:%(name)s:%(message)s' % { 'file': input_filename, diff --git a/scripts/book2ihtml b/scripts/book2ihtml new file mode 100755 index 0000000..2f94be9 --- /dev/null +++ b/scripts/book2ihtml @@ -0,0 +1,54 @@ +#!/usr/bin/env python +import os +import optparse + +from librarian import html, ParseError + + +if __name__ == '__main__': + # Parse commandline arguments + usage = """Usage: %prog [options] SOURCE [SOURCE...] + Convert SOURCE files to HTML format.""" + + parser = optparse.OptionParser(usage=usage) + + parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, + help='print status messages to stdout') + parser.add_option('-i', '--ignore-dublin-core', action='store_false', dest='parse_dublincore', default=True, + help='don\'t try to parse dublin core metadata') + + options, input_filenames = parser.parse_args() + + if len(input_filenames) < 1: + parser.print_help() + exit(1) + + # Do some real work + for input_filename in input_filenames: + if options.verbose: + print input_filename + + output_filename = os.path.splitext(input_filename)[0] + '.html' + try: + html.transform(input_filename, output_filename, parse_dublincore=options.parse_dublincore,\ + stylesheet='partial') + except ParseError, e: + print '%(file)s:%(name)s:%(message)s' % { + 'file': input_filename, + 'name': e.__class__.__name__, + 'message': e.message.encode('utf-8') + } + except IOError, e: + print '%(file)s:%(name)s:%(message)s' % { + 'file': input_filename, + 'name': e.__class__.__name__, + 'message': e.strerror, + } + except BaseException, e: + print '%(file)s:%(etype)s:%(message)s' % { + 'file': input_filename, + 'etype': e.__class__.__name__, + 'message': e.message.encode('utf-8'), + } + raise e + diff --git a/scripts/book2txt b/scripts/book2txt index 41a3978..55482a6 100755 --- a/scripts/book2txt +++ b/scripts/book2txt @@ -17,7 +17,9 @@ if __name__ == '__main__': help='print status messages to stdout') parser.add_option('-w', '--wrap', action='store', type='int', dest='wrapping', default=0, help='set line wrap column') - + parser.add_option('-i', '--ignore-dublin-core', action='store_false', dest='parse_dublincore', default=True, + help='don\'t try to parse dublin core metadata') + options, input_filenames = parser.parse_args() if len(input_filenames) < 1: @@ -31,7 +33,8 @@ if __name__ == '__main__': output_filename = os.path.splitext(input_filename)[0] + '.txt' try: - text.transform(input_filename, output_filename, wrapping=str(options.wrapping)) + text.transform(input_filename, output_filename, parse_dublincore=options.parse_dublincore, + wrapping=str(options.wrapping)) except ParseError, e: print '%(file)s:%(name)s:%(message)s' % { 'file': input_filename, diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..f2f658c --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[aliases] +test = nosetests --detailed-errors --with-doctest --with-coverage --cover-package=librarian diff --git a/setup.py b/setup.py index 09bb42b..34d016e 100755 --- a/setup.py +++ b/setup.py @@ -1,21 +1,21 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from distutils.core import setup -from tests.utils import TestCommand +from ez_setup import use_setuptools +use_setuptools() + +from setuptools import setup, find_packages + setup( name='librarian', - version='1.2.1', + version='1.2.5', description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats', author='Marek Stępniowski', author_email='marek@stepniowski.com', url='http://redmine.nowoczesnapolska.org.pl/', - packages=['librarian', 'tests'], - package_dir={'librarian': 'librarian', 'tests': 'tests'}, - package_data={ - 'librarian': ['*.xslt'], - 'tests': ['files/dcparser/*.xml', 'files/erroneous/*.xml'], - }, + packages=find_packages(exclude=['tests']), + include_package_data=True, + install_requires=['lxml>=2.2'], scripts=['scripts/book2html', 'scripts/book2txt', 'scripts/bookfragments', 'scripts/genslugs'], - cmdclass={'test': TestCommand}, + tests_require=['nose>=0.11', 'coverage>=3.0.1'], ) diff --git a/tests/files/text/asnyk_miedzy_nami.xml b/tests/files/text/asnyk_miedzy_nami.xml old mode 100644 new mode 100755 index 5716a28..d7ab4fc --- a/tests/files/text/asnyk_miedzy_nami.xml +++ b/tests/files/text/asnyk_miedzy_nami.xml @@ -1,25 +1,64 @@ - - Adam Asnyk - Między nami nic nie było - - Między nami nic nie było!/ - Żadnych zwierzeń, wyznań żadnych!/ - Nic nas z sobą nie łączyło ---/ - Prócz wiosennych marzeń zdradnych; - - Prócz tych woni, barw i blasków,/ - Unoszących się w przestrzeni;/ - Prócz szumiących śpiewem lasków/ - I tej świeżej łąk zieleni; - - Prócz tych kaskad i potoków,/ - Zraszających każdy parów,/ - Prócz girlandy tęcz, obłoków,/ - Prócz natury słodkich czarów; - - Prócz tych wspólnych, jasnych zdrojów,/ - Z których serce zachwyt piło;/ - Prócz pierwiosnków i powojów,---/ - Między nami nic nie było! - + + + + + +Asnyk, Adam +Między nami nic nie było +Sekuła, Aleksandra +Sutkowska, Olga +Fundacja Nowoczesna Polska +Pozytywizm +Liryka +Wiersz +Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. +http://wolnelektury.pl/katalog/lektura/miedzy-nami-nic-nie-bylo +http://www.polona.pl/Content/5164 +(Asnyk, Adam) El...y (1838-1897), Poezye, t. 3, Gebethner i Wolff, wyd. nowe poprzedzone słowem wstępnym St. Krzemińskiego, Warszawa, 1898 +Domena publiczna - Adam Asnyk zm. 1897 +1897 +xml +text +text +2007-09-06 +L +pol + + + + +Adam Asnyk + +Miłość platonicznaMiędzy nami nic nie było + + + +Między nami nic nie było!/ +Żadnych zwierzeń, wyznań żadnych!/ +Nic nas z sobą nie łączyło ---/ +Prócz wiosennych marzeń zdradnych; + + + +NaturaPrócz tych woni, barw i blasków,/ +Unoszących się w przestrzeni;/ +Prócz szumiących śpiewem lasków/ +I tej świeżej łąk zieleni; + + + +Prócz tych kaskad i potoków,/ +Zraszających każdy parów,/ +Prócz girlandy tęcz, obłoków,/ +Prócz natury słodkich czarów; + + + +Prócz tych wspólnych, jasnych zdrojów,/ +Z których serce zachwyt piło;/ +Prócz pierwiosnków i powojów,---/ +Między nami nic nie było! + + + diff --git a/tests/files/text/asnyk_miedzy_nami_expected.html b/tests/files/text/asnyk_miedzy_nami_expected.html new file mode 100644 index 0000000..49a2691 --- /dev/null +++ b/tests/files/text/asnyk_miedzy_nami_expected.html @@ -0,0 +1,47 @@ +
+
+

Spis treści

+
    +
+

+ Adam Asnyk + Miłość platonicznaMiędzy nami nic nie było +

+ Miłość platoniczna +
+

1Między nami nic nie było!

+

+Żadnych zwierzeń, wyznań żadnych!

+

+Nic nas z sobą nie łączyło —

+

+Prócz wiosennych marzeń zdradnych;

+
+
+

5NaturaPrócz tych woni, barw i blasków,

+

+Unoszących się w przestrzeni;

+

+Prócz szumiących śpiewem lasków

+

+I tej świeżej łąk zieleni;

+
+
+

Prócz tych kaskad i potoków,

+

10 +Zraszających każdy parów,

+

+Prócz girlandy tęcz, obłoków,

+

+Prócz natury słodkich czarów;

+
+
+

Prócz tych wspólnych, jasnych zdrojów,

+

+Z których serce zachwyt piło;

+

15 +Prócz pierwiosnków i powojów,—

+

+Między nami nic nie było!

+
+
diff --git a/tests/files/text/asnyk_miedzy_nami_expected.txt b/tests/files/text/asnyk_miedzy_nami_expected.txt new file mode 100644 index 0000000..6e54969 --- /dev/null +++ b/tests/files/text/asnyk_miedzy_nami_expected.txt @@ -0,0 +1,38 @@ +Kodowanie znaków w dokumencie: UTF-8. +----- +Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl/). Reprodukcja cyfrowa wykonana przez +Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. Ten utwór nie jest chroniony prawem autorskim i znajduje +się w domenie publicznej, co oznacza, że możesz go swobodnie wykorzystywać, publikować i rozpowszechniać. + +Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dostępna jest na stronie http://wolnelektury.pl/katalog/lektura/miedzy-nami-nic-nie-bylo. +----- + + + + + +Adam Asnyk + +Między nami nic nie było + + + +Między nami nic nie było! +Żadnych zwierzeń, wyznań żadnych! +Nic nas z sobą nie łączyło — +Prócz wiosennych marzeń zdradnych; + +Prócz tych woni, barw i blasków, +Unoszących się w przestrzeni; +Prócz szumiących śpiewem lasków +I tej świeżej łąk zieleni; + +Prócz tych kaskad i potoków, +Zraszających każdy parów, +Prócz girlandy tęcz, obłoków, +Prócz natury słodkich czarów; + +Prócz tych wspólnych, jasnych zdrojów, +Z których serce zachwyt piło; +Prócz pierwiosnków i powojów,— +Między nami nic nie było! diff --git a/tests/files/text/asnyk_miedzy_nami.txt b/tests/files/text/asnyk_miedzy_nami_nodc.txt similarity index 100% rename from tests/files/text/asnyk_miedzy_nami.txt rename to tests/files/text/asnyk_miedzy_nami_nodc.txt diff --git a/tests/files/text/asnyk_miedzy_nami_nodc.xml b/tests/files/text/asnyk_miedzy_nami_nodc.xml new file mode 100644 index 0000000..5716a28 --- /dev/null +++ b/tests/files/text/asnyk_miedzy_nami_nodc.xml @@ -0,0 +1,25 @@ + + + Adam Asnyk + Między nami nic nie było + + Między nami nic nie było!/ + Żadnych zwierzeń, wyznań żadnych!/ + Nic nas z sobą nie łączyło ---/ + Prócz wiosennych marzeń zdradnych; + + Prócz tych woni, barw i blasków,/ + Unoszących się w przestrzeni;/ + Prócz szumiących śpiewem lasków/ + I tej świeżej łąk zieleni; + + Prócz tych kaskad i potoków,/ + Zraszających każdy parów,/ + Prócz girlandy tęcz, obłoków,/ + Prócz natury słodkich czarów; + + Prócz tych wspólnych, jasnych zdrojów,/ + Z których serce zachwyt piło;/ + Prócz pierwiosnków i powojów,---/ + Między nami nic nie było! + diff --git a/tests/test_dcparser.py b/tests/test_dcparser.py old mode 100755 new mode 100644 index 62e664c..fcbc363 --- a/tests/test_dcparser.py +++ b/tests/test_dcparser.py @@ -1,56 +1,44 @@ -#!/usr/bin/env python # -*- coding: utf-8 -*- - -import unittest - +from librarian import dcparser from lxml import etree -from utils import get_file_path -from librarian import dcparser, html, ParseError -from utils import AutoTestMetaclass - -class TestDCParser(unittest.TestCase): - __metaclass__ = AutoTestMetaclass +from nose.tools import * +from os.path import splitext +from tests.utils import get_all_fixtures +import codecs - TEST_DIR = 'dcparser' - def run_auto_test(self, in_data, out_data): - info = dcparser.BookInfo.from_string(in_data).to_dict() - should_be = eval(out_data) - for key in should_be: - self.assertEqual( info[key], should_be[key] ) +def check_dcparser(xml_file, result_file): + xml = file(xml_file).read() + result = codecs.open(result_file, encoding='utf-8').read() + info = dcparser.BookInfo.from_string(xml).to_dict() + should_be = eval(result) + for key in should_be: + assert_equals(info[key], should_be[key]) -class TestDCSerialize(unittest.TestCase): - __metaclass__ = AutoTestMetaclass - TEST_DIR = 'dcserialize' +def test_dcparser(): + for fixture in get_all_fixtures('dcparser', '*.xml'): + base_name = splitext(fixture)[0] + yield check_dcparser, fixture, base_name + '.out' - def run_auto_test(self, in_data, out_data): - import lxml.etree - # first parse the input - info = dcparser.BookInfo.from_string(in_data) - # serialize - serialized = lxml.etree.tostring(info.to_etree(), encoding=unicode).encode('utf-8') +def check_serialize(xml_file): + xml = file(xml_file).read() + info = dcparser.BookInfo.from_string(xml) - # then parse again - info_bis = dcparser.BookInfo.from_string(serialized) + # serialize + serialized = etree.tostring(info.to_etree(), encoding=unicode).encode('utf-8') + # then parse again + info_bis = dcparser.BookInfo.from_string(serialized) - # check if they are the same - for key in vars(info): - self.assertEqual( getattr(info, key), getattr(info_bis, key)) + # check if they are the same + for key in vars(info): + assert_equals(getattr(info, key), getattr(info_bis, key)) + for key in vars(info_bis): + assert_equals(getattr(info, key), getattr(info_bis, key)) - for key in vars(info_bis): - self.assertEqual( getattr(info, key), getattr(info_bis, key)) -class TestParserErrors(unittest.TestCase): - def test_error(self): - try: - html.transform(get_file_path('erroneous', 'asnyk_miedzy_nami.xml'), - get_file_path('erroneous', 'asnyk_miedzy_nami.html')) - self.fail() - except ParseError: - pass - #self.assertEqual(e.position, (25, 13)) +def test_serialize(): + for fixture in get_all_fixtures('dcparser', '*.xml'): + yield check_serialize, fixture -if __name__ == '__main__': - unittest.main() diff --git a/tests/test_html.py b/tests/test_html.py new file mode 100644 index 0000000..86fcfac --- /dev/null +++ b/tests/test_html.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +from librarian import html, NoDublinCore +from nose.tools import * +from utils import get_fixture, remove_output_file + + +def teardown_transform(): + remove_output_file('text', 'asnyk_miedzy_nami.html') + + +@with_setup(None, teardown_transform) +def test_transform(): + output_file_path = get_fixture('text', 'asnyk_miedzy_nami.html') + expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.html') + + html.transform( + get_fixture('text', 'asnyk_miedzy_nami.xml'), + output_file_path, + ) + + assert_equal(file(output_file_path).read(), file(expected_output_file_path).read()) + + +@with_setup(None, teardown_transform) +@raises(NoDublinCore) +def test_no_dublincore(): + html.transform( + get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'), + get_fixture('text', 'asnyk_miedzy_nami_nodc.html'), + ) + + +@with_setup(None, teardown_transform) +def test_passing_parse_dublincore_to_transform(): + """Passing parse_dublincore=False to transform omits DublinCore parsing.""" + html.transform( + get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'), + get_fixture('text', 'asnyk_miedzy_nami.html'), + parse_dublincore=False, + ) diff --git a/tests/test_text.py b/tests/test_text.py old mode 100755 new mode 100644 index 00fd787..020c571 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -1,22 +1,41 @@ -#!/usr/bin/env python -# encoding: utf-8 +# -*- coding: utf-8 -*- +from librarian import text, NoDublinCore +from nose.tools import * +from utils import get_fixture, remove_output_file -import unittest -from utils import get_file_path -from librarian import dcparser -from librarian import text, NoDublinCore +def teardown_transform(): + remove_output_file('text', 'asnyk_miedzy_nami.txt') + + +@with_setup(None, teardown_transform) +def test_transform(): + output_file_path = get_fixture('text', 'asnyk_miedzy_nami.txt') + expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.txt') + + text.transform( + get_fixture('text', 'asnyk_miedzy_nami.xml'), + output_file_path, + ) + + assert_equal(file(output_file_path).read(), file(expected_output_file_path).read()) -class TestXML(unittest.TestCase): - def test_no_dublincore(self): - try: - text.transform(get_file_path('text', 'asnyk_miedzy_nami.xml'), - get_file_path('text', 'asnyk_miedzy_nami.txt')) - self.fail() - except NoDublinCore, e: - pass +@with_setup(None, teardown_transform) +@raises(NoDublinCore) +def test_no_dublincore(): + text.transform( + get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'), + get_fixture('text', 'asnyk_miedzy_nami_nodc.txt'), + ) -if __name__ == '__main__': - unittest.main() +@with_setup(None, teardown_transform) +def test_passing_parse_dublincore_to_transform(): + """Passing parse_dublincore=False to transform omits DublinCore parsing.""" + text.transform( + get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'), + get_fixture('text', 'asnyk_miedzy_nami.txt'), + parse_dublincore=False, + ) + \ No newline at end of file diff --git a/tests/utils.py b/tests/utils.py index 1870a07..bea2038 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,62 +1,25 @@ -from __future__ import with_statement - +from os.path import realpath, join, dirname +import glob import os -from distutils.core import Command -from unittest import TextTestRunner, TestLoader -from glob import glob -from os.path import dirname, join, realpath, splitext, basename, walk -from os import listdir -import codecs - -class AutoTestMetaclass(type): - - def __new__(cls, name, bases, class_dict): - test_dir = class_dict.pop('TEST_DIR') - path = realpath( join(dirname(__file__), 'files', test_dir) ) - for file in listdir(path): - base, ext = splitext(file) - if ext != '.xml': - continue - class_dict['test_'+base] = cls.make_test_runner(base, \ - join(path, base +'.xml'), join(path, base + '.out') ) +def get_fixture_dir(dir_name): + """Returns path to fixtures directory dir_name.""" + return realpath(join(dirname(__file__), 'files', dir_name)) - return type.__new__(cls, name, bases, class_dict) - - @staticmethod - def make_test_runner(name, inputf, outputf): - def runner(self): - with open(inputf, 'rb') as ifd: - with codecs.open(outputf, 'rb', encoding='utf-8') as ofd: - self.run_auto_test(ifd.read(), ofd.read()) - return runner +def get_fixture(dir_name, file_name): + """Returns path to fixture file_name in directory dir_name.""" + return join(get_fixture_dir(dir_name), file_name) -def get_file_path(dir_name, file_name): - return realpath(join(dirname(__file__), 'files', dir_name, file_name)) -class TestCommand(Command): - user_options = [] +def get_all_fixtures(dir_name, glob_pattern='*'): + """Returns list of paths for fixtures in directory dir_name matching the glob_pattern.""" + return [get_fixture(dir_name, file_name) for file_name in glob.glob(join(get_fixture_dir(dir_name), glob_pattern))] - def initialize_options(self): - self._dir = os.getcwd() - def finalize_options(self): +def remove_output_file(dir_name, file_name): + try: + os.remove(get_fixture(dir_name, file_name)) + except: pass - - def run(self): - ''' - Finds all the tests modules in tests/, and runs them. - ''' - testfiles = [] - for t in glob(join(self._dir, 'tests', '*.py')): - module_name = splitext(basename(t))[0] - if module_name.startswith('test'): - testfiles.append('.'.join(['tests', module_name]) - ) - - tests = TestLoader().loadTestsFromNames(testfiles) - t = TextTestRunner(verbosity=2) - t.run(tests) - -- 2.20.1