MANIFEST
dist
build
-nbproject
-nbproject/*
+*.egg-info
+.coverage
-include librarian/*.xslt
-recursive-include tests/files/ *.xml
+include librarian/*.xslt
+include librarian/config.xml
--- /dev/null
+#!python
+"""Bootstrap setuptools installation
+
+If you want to use setuptools in your package's setup.py, just include this
+file in the same directory with it, and add this to the top of your setup.py::
+
+ from ez_setup import use_setuptools
+ use_setuptools()
+
+If you want to require a specific version of setuptools, set a download
+mirror, or use an alternate download directory, you can do so by supplying
+the appropriate options to ``use_setuptools()``.
+
+This file can also be run as a script to install or upgrade setuptools.
+"""
+import sys
+DEFAULT_VERSION = "0.6c9"
+DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3]
+
+md5_data = {
+ 'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca',
+ 'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb',
+ 'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b',
+ 'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a',
+ 'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618',
+ 'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac',
+ 'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5',
+ 'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4',
+ 'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c',
+ 'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b',
+ 'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27',
+ 'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277',
+ 'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa',
+ 'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e',
+ 'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e',
+ 'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f',
+ 'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2',
+ 'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc',
+ 'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167',
+ 'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64',
+ 'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d',
+ 'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20',
+ 'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab',
+ 'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53',
+ 'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2',
+ 'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e',
+ 'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372',
+ 'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902',
+ 'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de',
+ 'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b',
+ 'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03',
+ 'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a',
+ 'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6',
+ 'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a',
+}
+
+import sys, os
+try: from hashlib import md5
+except ImportError: from md5 import md5
+
+def _validate_md5(egg_name, data):
+ """Return `data` unchanged after checking it against the md5 registry.
+
+ Eggs whose name is not a key of ``md5_data`` are passed through without
+ any check. On a digest mismatch a message is written to ``sys.stderr``
+ and the process exits with status 2.
+ """
+ if egg_name in md5_data:
+ digest = md5(data).hexdigest()
+ if digest != md5_data[egg_name]:
+ print >>sys.stderr, (
+ "md5 validation of %s failed! (Possible download problem?)"
+ % egg_name
+ )
+ sys.exit(2)
+ return data
+
+def use_setuptools(
+ version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
+ download_delay=15
+):
+ """Automatically find/download setuptools and make it available on sys.path
+
+ `version` should be a valid setuptools version number that is available
+ as an egg for download under the `download_base` URL (which should end with
+ a '/'). `to_dir` is the directory where setuptools will be downloaded, if
+ it is not already available. If `download_delay` is specified, it should
+ be the number of seconds that will be paused before initiating a download,
+ should one be required. If an older version of setuptools is installed,
+ this routine will print a message to ``sys.stderr`` and raise SystemExit in
+ an attempt to abort the calling script.
+ """
+ # Recorded before any import below: a version conflict is only fatal when
+ # setuptools/pkg_resources had already been loaded by the caller.
+ was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules
+ def do_download():
+ # Fetch the egg, put it first on sys.path and record its location on
+ # the setuptools module imported from it.
+ egg = download_setuptools(version, download_base, to_dir, download_delay)
+ sys.path.insert(0, egg)
+ import setuptools; setuptools.bootstrap_install_from = egg
+ try:
+ import pkg_resources
+ except ImportError:
+ # No pkg_resources available at all: download the egg.
+ return do_download()
+ try:
+ pkg_resources.require("setuptools>="+version); return
+ except pkg_resources.VersionConflict, e:
+ if was_imported:
+ print >>sys.stderr, (
+ "The required version of setuptools (>=%s) is not available, and\n"
+ "can't be installed while this script is running. Please install\n"
+ " a more recent version first, using 'easy_install -U setuptools'."
+ "\n\n(Currently using %r)"
+ ) % (version, e.args[0])
+ sys.exit(2)
+ else:
+ # pkg_resources was first imported by this function, so it is dropped
+ # from sys.modules again before the requested egg is downloaded.
+ del pkg_resources, sys.modules['pkg_resources'] # reload ok
+ return do_download()
+ except pkg_resources.DistributionNotFound:
+ return do_download()
+
+def download_setuptools(
+ version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
+ delay = 15
+):
+ """Download setuptools from a specified location and return its filename
+
+ `version` should be a valid setuptools version number that is available
+ as an egg for download under the `download_base` URL (which should end
+ with a '/'). `to_dir` is the directory where the egg will be downloaded.
+ `delay` is the number of seconds to pause before an actual download attempt.
+
+ The return value is the real path of the egg inside `to_dir`; an egg file
+ that already exists there is reused without downloading.
+ """
+ import urllib2, shutil
+ # The egg name embeds the first three characters of sys.version.
+ egg_name = "setuptools-%s-py%s.egg" % (version,sys.version[:3])
+ url = download_base + egg_name
+ saveto = os.path.join(to_dir, egg_name)
+ # Both start as None so the finally clause only closes what was opened.
+ src = dst = None
+ if not os.path.exists(saveto): # Avoid repeated downloads
+ try:
+ from distutils import log
+ if delay:
+ log.warn("""
+---------------------------------------------------------------------------
+This script requires setuptools version %s to run (even to display
+help). I will attempt to download it for you (from
+%s), but
+you may need to enable firewall access for this script first.
+I will start the download in %d seconds.
+
+(Note: if this machine does not have network access, please obtain the file
+
+ %s
+
+and place it in this directory before rerunning this script.)
+---------------------------------------------------------------------------""",
+ version, download_base, delay, url
+ ); from time import sleep; sleep(delay)
+ log.warn("Downloading %s", url)
+ src = urllib2.urlopen(url)
+ # Read/write all in one block, so we don't create a corrupt file
+ # if the download is interrupted.
+ # _validate_md5 exits the process on a checksum mismatch, so nothing
+ # is written in that case.
+ data = _validate_md5(egg_name, src.read())
+ dst = open(saveto,"wb"); dst.write(data)
+ finally:
+ if src: src.close()
+ if dst: dst.close()
+ return os.path.realpath(saveto)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+def main(argv, version=DEFAULT_VERSION):
+ """Install or upgrade setuptools and EasyInstall"""
+ try:
+ import setuptools
+ except ImportError:
+ # setuptools is not importable: download the egg, run easy_install from
+ # it with the egg appended to the arguments, and delete the downloaded
+ # file afterwards.
+ egg = None
+ try:
+ egg = download_setuptools(version, delay=0)
+ sys.path.insert(0,egg)
+ from setuptools.command.easy_install import main
+ return main(list(argv)+[egg]) # we're done here
+ finally:
+ if egg and os.path.exists(egg):
+ os.unlink(egg)
+ else:
+ # Version string '0.0.1' is rejected as an obsolete installation.
+ if setuptools.__version__ == '0.0.1':
+ print >>sys.stderr, (
+ "You have an obsolete version of setuptools installed. Please\n"
+ "remove it from your system entirely before rerunning this script."
+ )
+ sys.exit(2)
+
+ req = "setuptools>="+version
+ import pkg_resources
+ try:
+ pkg_resources.require(req)
+ except pkg_resources.VersionConflict:
+ # Installed setuptools is too old: run easy_install on a freshly
+ # downloaded egg, importing it from whichever location is available.
+ try:
+ from setuptools.command.easy_install import main
+ except ImportError:
+ from easy_install import main
+ main(list(argv)+[download_setuptools(delay=0)])
+ sys.exit(0) # try to force an exit
+ else:
+ # Requirement already satisfied: forward any arguments to easy_install,
+ # otherwise just report the installed state.
+ if argv:
+ from setuptools.command.easy_install import main
+ main(argv)
+ else:
+ print "Setuptools version",version,"or greater has been installed."
+ print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)'
+
+def update_md5(filenames):
+ """Update our built-in md5 registry
+
+ Computes the md5 of every file in `filenames`, stores it in ``md5_data``
+ under the file's base name, and rewrites the ``md5_data = {...}`` literal
+ in this module's own source file with the sorted registry.
+ """
+
+ import re
+
+ for name in filenames:
+ base = os.path.basename(name)
+ f = open(name,'rb')
+ md5_data[base] = md5(f.read()).hexdigest()
+ f.close()
+
+ # One source line per registry entry, sorted so the output is stable.
+ data = [" %r: %r,\n" % it for it in md5_data.items()]
+ data.sort()
+ repl = "".join(data)
+
+ import inspect
+ srcfile = inspect.getsourcefile(sys.modules[__name__])
+ f = open(srcfile, 'rb'); src = f.read(); f.close()
+
+ # Group 1 is the body of the existing md5_data literal.
+ match = re.search("\nmd5_data = {\n([^}]+)}", src)
+ if not match:
+ print >>sys.stderr, "Internal error!"
+ sys.exit(2)
+
+ # Splice the regenerated entries in place of the old literal body.
+ src = src[:match.start(1)] + repl + src[match.end(1):]
+ f = open(srcfile,'w')
+ f.write(src)
+ f.close()
+
+
+# Script entry point: "--md5update FILE..." refreshes the md5 registry from
+# the given files; any other arguments are passed to main().
+if __name__=='__main__':
+ if len(sys.argv)>2 and sys.argv[1]=='--md5update':
+ update_md5(sys.argv[2:])
+ else:
+ main(sys.argv[1:])
+
+
+
+
+
# exception classes
class ParseError(Exception):
- pass
+    """Raised when a document cannot be parsed or transformed.
+
+    `cause` is the underlying exception (or any other object describing
+    the failure) and is kept on ``self.cause``. ``self.message`` is
+    `message` when one is given, otherwise ``cause.message``; if that
+    cannot be read it falls back to the fixed text "No message.".
+    """
+
+    def __init__(self, cause, message=None):
+        self.cause = cause
+        try:
+            self.message = message or self.cause.message
+        except Exception:
+            # Best-effort lookup: causes without a usable ``message``
+            # still yield an error object. KeyboardInterrupt and
+            # SystemExit are deliberately not caught here.
+            self.message = "No message."
class ValidationError(Exception):
pass
class NoDublinCore(ValidationError):
pass
+
+class XMLNamespace(object):
+    """Callable helper for building and recognising names in one XML namespace.
+
+    Calling an instance with a local tag name returns that name in
+    ``{uri}tag`` notation; ``name in ns`` tells whether a name starts with
+    this namespace's ``{uri}`` prefix.
+    """
+
+    def __init__(self, uri):
+        self.uri = uri
+
+    def __call__(self, tag):
+        qualified = '{%s}%s' % (self.uri, tag)
+        return qualified
+
+    def __contains__(self, tag):
+        prefix = '{' + str(self) + '}'
+        return tag.startswith(prefix)
+
+    def __repr__(self):
+        return 'XMLNamespace(%r)' % self.uri
+
+    def __str__(self):
+        return '%s' % self.uri
+
+class EmptyNamespace(XMLNamespace):
+    """Namespace helper for names that carry no namespace at all.
+
+    Calling an instance returns the tag unchanged, and a name belongs to
+    it exactly when it has no ``{uri}`` prefix.
+    """
+
+    def __init__(self):
+        super(EmptyNamespace, self).__init__('')
+
+    def __call__(self, tag):
+        return tag
+
+    def __contains__(self, tag):
+        # The inherited check would look for a literal "{}" prefix, which
+        # un-namespaced names never have.
+        return not tag.startswith('{')
+
+# Namespace helpers shared across the package; calling one with a local
+# name returns the name in "{uri}name" form.
+RDFNS = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
+DCNS = XMLNamespace('http://purl.org/dc/elements/1.1/')
+XINS = XMLNamespace("http://www.w3.org/2001/XInclude")
+XHTMLNS = XMLNamespace("http://www.w3.org/1999/xhtml")
+
+# EmptyNamespace returns tags unchanged when called.
+WLNS = EmptyNamespace()
+
+import lxml.etree as etree
+import dcparser
+
+# Placeholder metadata that wrap_text() uses when no bookinfo is passed.
+# The first dict holds the rdf:Description attributes, the second maps
+# Dublin Core field names to lists of text values.
+DEFAULT_BOOKINFO = dcparser.BookInfo(
+ { RDFNS('about'): u'http://wiki.wolnepodreczniki.pl/Lektury:Template'},\
+ { DCNS('creator'): [u'Some, Author'],
+ DCNS('title'): [u'Some Title'],
+ DCNS('subject.period'): [u'Unknown'],
+ DCNS('subject.type'): [u'Unknown'],
+ DCNS('subject.genre'): [u'Unknown'],
+ DCNS('date'): ['1970-01-01'],
+ # DCNS('date'): [creation_date],
+ DCNS('publisher'): [u"Fundacja Nowoczesna Polska"],
+ DCNS('description'):
+ [u"""Publikacja zrealizowana w ramach projektu
+ Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa
+ wykonana przez Bibliotekę Narodową z egzemplarza
+ pochodzącego ze zbiorów BN."""],
+ DCNS('identifier.url'):
+ [u"http://wolnelektury.pl/katalog/lektura/template"],
+ DCNS('rights'):
+ [u"Domena publiczna - zm. [OPIS STANU PRAWNEGO TEKSTU]"] })
+
+def xinclude_forURI(uri):
+    """Return the unicode markup of an XInclude element whose href is `uri`."""
+    include_elem = etree.Element(XINS("include"))
+    include_elem.set("href", uri)
+    markup = etree.tostring(include_elem, encoding=unicode)
+    return markup
+
+def wrap_text(ocrtext, creation_date, bookinfo=DEFAULT_BOOKINFO):
+    """Embed `ocrtext` in a minimal <utwor> document preceded by DC metadata.
+
+    `creation_date` is stored on ``bookinfo.created_at`` before the
+    metadata is serialised; this modifies the object passed in (including
+    the shared default).
+    """
+    bookinfo.created_at = creation_date
+
+    dc_tree = bookinfo.to_etree()
+    dcstring = etree.tostring(
+        dc_tree, method='xml', encoding=unicode, pretty_print=True)
+
+    return (u'<utwor>\n' + dcstring + u'\n<plain-text>\n' + ocrtext
+            + u'\n</plain-text>\n</utwor>')
\ No newline at end of file
<!-- ========================================== -->
<!-- Title page -->
<xsl:template match="autor_utworu" mode="header">
- <span class="author"><xsl:apply-templates mode="inline" /></span>
+ <span class="author editable"><xsl:apply-templates mode="inline" /></span>
</xsl:template>
<xsl:template match="nazwa_utworu" mode="header">
- <span class="title"><xsl:apply-templates mode="inline" /></span>
+ <span class="title editable"><xsl:apply-templates mode="inline" /></span>
</xsl:template>
<xsl:template match="dzielo_nadrzedne" mode="header">
</xsl:template>
<xsl:template match="akap|akap_dialog|akap_cd">
- <p class="paragraph"><xsl:apply-templates mode="inline" /></p>
+ <p class="paragraph editable"><xsl:apply-templates mode="inline" /></p>
</xsl:template>
<xsl:template match="strofa">
- <div class="stanza">
+ <div class="stanza editable">
<xsl:choose>
<xsl:when test="count(br) > 0">
<xsl:call-template name="verse">
</xsl:template>
-</xsl:stylesheet>
-
+</xsl:stylesheet>
\ No newline at end of file
--- /dev/null
+<config>
+ <block-elements>
+ <!-- tagi główne -->
+ <utwor />
+ <opowiadanie />
+ <liryka_l />
+ <liryka_lp />
+ <powiesc />
+ <dramat_wierszowany_l />
+ <dramat_wierszowany_lp />
+ <dramat_wspolczesny />
+
+ <!-- inne tagi -->
+ <nota />
+ <dedykacja />
+ <kwestia />
+ <motto />
+ <didaskalia />
+ </block-elements>
+
+ <inline-elements>
+ <!-- with emphasis -->
+ <mat />
+ <didask_tekst />
+ <slowo_obce />
+ <wyroznienie />
+ <osoba />
+ <tytul_dziela />
+ </inline-elements>
+
+ <paragraph-elements>
+ <!-- akapity -->
+ <akap />
+ <akap_cd />
+ <akap_dialog />
+ <miejsce_czas />
+ <motto_podpis />
+ <wers_cd />
+ <wers_akap />
+ <wers_wciety />
+ </paragraph-elements>
+
+ <header-1-elements>
+ <!-- placeholder -->
+ <dzielo_nadrzedne />
+ </header-1-elements>
+
+ <header-2-elements>
+ <naglowek_akt />
+ <naglowek_czesc />
+ <autor_utworu />
+ <nazwa_utworu />
+ <srodtytul />
+ </header-2-elements>
+
+ <header-3-elements>
+ <naglowek_scena />
+ <naglowek_rozdzial />
+ <podtytul />
+ </header-3-elements>
+
+ <header-4-elements>
+ <naglowek_osoba />
+ <naglowek_podrozdzial />
+ </header-4-elements>
+
+ <special-tags>
+ <strofa />
+ <lista_osob />
+ <lista_osoba />
+ <sekcja_swiatlo />
+ <sekcja_asterysk />
+ <separator_linia />
+ <zastepnik_wersu />
+ <dlugi_cytat />
+ </special-tags>
+
+ <annotations>
+ <pa />
+ <pe />
+ <pr />
+ <pt />
+ </annotations>
+
+ <no-show-elements>
+ <begin />
+ <end />
+ <extra />
+ <uwaga />
+ <motyw />
+ <br />
+ <pa />
+ <pe />
+ <pr />
+ <pt />
+ </no-show-elements>
+
+ <editable>
+ <strofa />
+ <akap />
+ <akap_cd />
+ <akap_dialog />
+ <dzielo_nadrzedne />
+
+ <naglowek_akt />
+ <naglowek_czesc />
+ <autor_utworu />
+ <nazwa_utworu />
+ <srodtytul />
+
+ <naglowek_scena />
+ <naglowek_rozdzial />
+ <podtytul />
+
+ <naglowek_osoba />
+ <naglowek_podrozdzial />
+
+ <lista_osoba />
+
+ <dlugi_cytat />
+ <poezja_cyt />
+
+ <didaskalia />
+ </editable>
+</config>
\ No newline at end of file
from datetime import date
import time
-from librarian import ValidationError, NoDublinCore
+from librarian import ValidationError, NoDublinCore, ParseError, DCNS, RDFNS
import lxml.etree as etree # ElementTree API using libxml2
from lxml.etree import XMLSyntaxError
return self.validate_value(f)
-# ==========
-# = Parser =
-# ==========
-class XMLNamespace(object):
- '''Represents XML namespace.'''
-
- def __init__(self, uri):
- self.uri = uri
-
- def __call__(self, tag):
- return '{%s}%s' % (self.uri, tag)
- def __contains__(self, tag):
- return tag.startswith(str(self))
- def __repr__(self):
- return 'XMLNamespace(%r)' % self.uri
-
- def __str__(self):
- return '%s' % self.uri
-
-
-class BookInfo(object):
- RDF = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
- DC = XMLNamespace('http://purl.org/dc/elements/1.1/')
-
+class BookInfo(object):
FIELDS = (
- Field( DC('creator'), 'author', as_person),
- Field( DC('title'), 'title'),
- Field( DC('subject.period'), 'epoches', salias='epoch', multiple=True),
- Field( DC('subject.type'), 'kinds', salias='kind', multiple=True),
- Field( DC('subject.genre'), 'genres', salias='genre', multiple=True),
- Field( DC('date'), 'created_at', as_date),
- Field( DC('date.pd'), 'released_to_public_domain_at', as_date, required=False),
- Field( DC('contributor.editor'), 'editors', \
+ Field( DCNS('creator'), 'author', as_person),
+ Field( DCNS('title'), 'title'),
+ Field( DCNS('subject.period'), 'epochs', salias='epoch', multiple=True),
+ Field( DCNS('subject.type'), 'kinds', salias='kind', multiple=True),
+ Field( DCNS('subject.genre'), 'genres', salias='genre', multiple=True),
+ Field( DCNS('date'), 'created_at', as_date),
+ Field( DCNS('date.pd'), 'released_to_public_domain_at', as_date, required=False),
+ Field( DCNS('contributor.editor'), 'editors', \
as_person, salias='editor', multiple=True, default=[]),
- Field( DC('contributor.translator'), 'translators', \
+ Field( DCNS('contributor.translator'), 'translators', \
as_person, salias='translator', multiple=True, default=[]),
- Field( DC('contributor.technical_editor'), 'technical_editors',
+ Field( DCNS('contributor.technical_editor'), 'technical_editors',
as_person, salias='technical_editor', multiple=True, default=[]),
- Field( DC('publisher'), 'publisher'),
- Field( DC('source'), 'source_name', required=False),
- Field( DC('source.URL'), 'source_url', required=False),
- Field( DC('identifier.url'), 'url'),
- Field( DC('relation.hasPart'), 'parts', multiple=True, required=False),
- Field( DC('rights.license'), 'license', required=False),
- Field( DC('rights'), 'license_description'),
+ Field( DCNS('publisher'), 'publisher'),
+ Field( DCNS('source'), 'source_name', required=False),
+ Field( DCNS('source.URL'), 'source_url', required=False),
+ Field( DCNS('identifier.url'), 'url'),
+ Field( DCNS('relation.hasPart'), 'parts', multiple=True, required=False),
+ Field( DCNS('rights.license'), 'license', required=False),
+ Field( DCNS('rights'), 'license_description'),
)
@classmethod
try:
iter = etree.iterparse(xmlfile, ['start', 'end'])
for (event, element) in iter:
- if element.tag == cls.RDF('RDF') and event == 'start':
+ if element.tag == RDFNS('RDF') and event == 'start':
desc_tag = element
break
# continue 'till the end of RDF section
for (event, element) in iter:
- if element.tag == cls.RDF('RDF') and event == 'end':
+ if element.tag == RDFNS('RDF') and event == 'end':
break
# if there is no end, Expat should yell at us with an ExpatError
def from_element(cls, rdf_tag):
# the tree is already parsed, so we don't need to worry about Expat errors
field_dict = {}
- desc = rdf_tag.find(".//" + cls.RDF('Description') )
+ desc = rdf_tag.find(".//" + RDFNS('Description') )
if desc is None:
raise NoDublinCore("No DublinCore section found.")
fv.append(e.text)
field_dict[e.tag] = fv
- return cls( desc.attrib, field_dict )
+ return cls( desc.attrib, field_dict )
def __init__(self, rdf_attrs, dc_fields):
"""rdf_attrs should be a dictionary-like object with any attributes of the RDF:Description.
dc_fields - dictionary mapping DC fields (with namespace) to list of text values for the
given field. """
- self.about = rdf_attrs.get(self.RDF('about'))
+ self.about = rdf_attrs.get(RDFNS('about'))
self.fmap = {}
for field in self.FIELDS:
#etree._namespace_map[str(self.DC)] = 'dc'
if parent is None:
- root = etree.Element(self.RDF('RDF'))
+ root = etree.Element(RDFNS('RDF'))
else:
- root = parent.makeelement(self.RDF('RDF'))
+ root = parent.makeelement(RDFNS('RDF'))
- description = etree.SubElement(root, self.RDF('Description'))
+ description = etree.SubElement(root, RDFNS('Description'))
if self.about:
- description.set(self.RDF('about'), self.about)
+ description.set(RDFNS('about'), self.about)
for field in self.FIELDS:
v = getattr(self, field.name, None)
return root
+
+ def serialize(self):
+ """Return the metadata as nested dicts pairing each value with its URI.
+
+ The result has an 'about' entry and a 'fields' entry; 'fields' maps
+ field names to ``{'uri': ..., 'value': ...}`` with values converted to
+ unicode (a list of unicode strings for multiple-valued fields).
+ """
+ rdf = {}
+ rdf['about'] = { 'uri': RDFNS('about'), 'value': self.about }
+
+ dc = {}
+ for field in self.FIELDS:
+ v = getattr(self, field.name, None)
+ if v is not None:
+ if field.multiple:
+ if len(v) == 0: continue
+ # Skip None entries inside the list; the list itself is already
+ # known to be non-None at this point.
+ v = [ unicode(x) for x in v if x is not None ]
+ else:
+ v = unicode(v)
+
+ dc[field.name] = {'uri': field.uri, 'value': v}
+ rdf['fields'] = dc
+ return rdf
+
def to_dict(self):
result = {'about': self.about}
for field in self.FIELDS:
if v is not None:
if field.multiple:
if len(v) == 0: continue
- v = [ unicode(x) for x in v ]
+ # Skip None entries inside the list (the list itself is non-None here).
+ v = [ unicode(x) for x in v if x is not None ]
else:
v = unicode(v)
result[field.name] = v
if field.salias:
v = getattr(self, field.salias)
- if v is not None: result[field.salias] = v
+ if v is not None: result[field.salias] = unicode(v)
return result
from lxml import etree
from librarian.parser import WLDocument
+from librarian import XHTMLNS, ParseError
+from lxml.etree import XMLSyntaxError, XSLTApplyError
ENTITY_SUBSTITUTIONS = [
(u'---', u'—'),
(u'"', u'”'),
]
+# Short stylesheet names mapped to XSLT file names; get_stylesheet() looks
+# the files up in this module's directory.
+STYLESHEETS = {
+    'legacy': 'book2html.xslt',
+    'full': 'wl2html_full.xslt',
+    'partial': 'wl2html_partial.xslt'
+}
+
+def get_stylesheet(name):
+    """Return the path of the stylesheet registered under `name`.
+
+    Raises KeyError when `name` is not a key of STYLESHEETS.
+    """
+    module_dir = os.path.dirname(__file__)
+    return os.path.join(module_dir, STYLESHEETS[name])
def substitute_entities(context, text):
"""XPath extension function converting all entites in passed text."""
text = text.replace(entity, substitutution)
return text
-
# Register substitute_entities function with lxml
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['substitute_entities'] = substitute_entities
-
-def transform(input, output_filename=None, is_file=True):
+def transform(input, output_filename=None, is_file=True, \
+ parse_dublincore=True, stylesheet='legacy', options={}):
"""Transforms file input_filename in XML to output_filename in XHTML."""
# Parse XSLT
- style_filename = os.path.join(os.path.dirname(__file__), 'book2html.xslt')
- style = etree.parse(style_filename)
+ try:
+ style_filename = get_stylesheet(stylesheet)
+ style = etree.parse(style_filename)
- if is_file:
- document = WLDocument.from_file(input, True)
- else:
- document = WLDocument.from_string(input, True)
-
- result = document.transform(style)
- del document # no longer needed large object :)
+ if is_file:
+ document = WLDocument.from_file(input, True, \
+ parse_dublincore=parse_dublincore)
+ else:
+ document = WLDocument.from_string(input, True, \
+ parse_dublincore=parse_dublincore)
- if result.find('//p') is not None:
- add_anchors(result.getroot())
- add_table_of_contents(result.getroot())
+ result = document.transform(style, **options)
+ del document # no longer needed large object :)
- if output_filename is not None:
- result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8')
+ if etree.ETXPath('//p|//{%s}p' % str(XHTMLNS))(result) is not None:
+ add_anchors(result.getroot())
+ add_table_of_contents(result.getroot())
+
+ if output_filename is not None:
+ result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8')
+ else:
+ return result
+ return True
else:
- return result
- return True
- else:
- return False
-
+ print "[Librarian] didn't find any paragraphs"
+ return "<empty />"
+ except KeyError:
+ # Name the rejected stylesheet in the error message.
+ raise ValueError("'%s' is not a valid stylesheet." % stylesheet)
+ except (XMLSyntaxError, XSLTApplyError), e:
+ raise ParseError(e)
class Fragment(object):
def __init__(self, id, themes):
# -*- coding: utf-8 -*-
-from librarian import ValidationError, NoDublinCore, dcparser, ParseError
+from librarian import ValidationError, NoDublinCore, ParseError
+from librarian import RDFNS, DCNS
+from librarian import dcparser
+
from xml.parsers.expat import ExpatError
from lxml import etree
-from lxml.etree import XMLSyntaxError
+from lxml.etree import XMLSyntaxError, XSLTApplyError
import re
from StringIO import StringIO
class WLDocument(object):
LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE);
- def __init__(self, edoc):
+ def __init__(self, edoc, parse_dublincore=True):
self.edoc = edoc
root_elem = edoc.getroot()
- rdf_ns = dcparser.BookInfo.RDF
- dc_path = './/' + rdf_ns('RDF')
+
+ dc_path = './/' + RDFNS('RDF')
if root_elem.tag != 'utwor':
raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
- self.rdf_elem = root_elem.find(dc_path)
-
- if self.rdf_elem is None:
- raise NoDublinCore('Document has no DublinCore - which is required.')
-
- self.book_info = dcparser.BookInfo.from_element(self.rdf_elem)
+ if parse_dublincore:
+ self.rdf_elem = root_elem.find(dc_path)
+ if self.rdf_elem is None:
+ raise NoDublinCore('Document has no DublinCore - which is required.')
+
+ self.book_info = dcparser.BookInfo.from_element(self.rdf_elem)
+ else:
+ self.book_info = None
+
@classmethod
- def from_string(cls, xml, swap_endlines=False):
- return cls.from_file(StringIO(xml), swap_endlines)
+ def from_string(cls, xml, swap_endlines=False, parse_dublincore=True):
+ return cls.from_file(StringIO(xml), swap_endlines, parse_dublincore=parse_dublincore)
@classmethod
- def from_file(cls, xmlfile, swap_endlines=False):
+ def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True):
# first, prepare for parsing
if isinstance(xmlfile, basestring):
try:
parser = etree.XMLParser(remove_blank_text=True)
- return cls( etree.parse(StringIO(data), parser) )
- except XMLSyntaxError, e:
- raise ParseError(e.message)
- except ExpatError, e:
- raise ParseError(e.message)
+ return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore)
+ except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
+ raise ParseError(e)
+
+ def part_as_text(self, path):
+     """Serialise the first element matching `path`, or return None.
+
+     `path` is translated with path_to_xpath() and evaluated against the
+     parsed document; the first match is returned as pretty-printed
+     unicode XML. Progress is echoed to stdout with an "[L]" prefix.
+     """
+     print "[L] Retrieving part:", path
+
+     xpath_expr = self.path_to_xpath(path)
+     matches = self.edoc.xpath(xpath_expr)
+     print "[L] xpath", matches
+
+     if len(matches) == 0:
+         return None
+
+     first = matches[0]
+     return etree.tostring(first, encoding=unicode, pretty_print=True)
+
+
+ def path_to_xpath(self, path):
+     """Translate a simplified ``tag[n]/tag[n]`` path into an XPath expression.
+
+     Every ``tag[n]`` segment becomes a ``node()`` step constrained by
+     position and name; segments without an index are copied as they are.
+     A leading ``.`` segment is replaced by an empty one, so the result
+     then starts with ``/``.
+     """
+     steps = []
+
+     for segment in path.split('/'):
+         indexed = re.match(r'([^\[]+)\[(\d+)\]', segment)
+         if indexed:
+             name, index = indexed.groups()
+             steps.append("node()[position() = %d and name() = '%s']" % (int(index), name))
+         else:
+             steps.append(segment)
+
+     if steps[0] == '.':
+         steps[0] = ''
+
+     return '/'.join(steps)
def transform(self, stylesheet, **options):
return self.edoc.xslt(stylesheet, **options)
def serialize(self):
self.update_dc()
return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)
+
+ def merge_chunks(self, chunk_dict):
+     """Replace document nodes with re-parsed chunks and report failures.
+
+     `chunk_dict` maps simplified paths (see path_to_xpath) to XML
+     strings. For each entry the first node matching the path is replaced
+     by the parsed string. Entries that cannot be merged are not raised;
+     instead the repr of ``(key, xpath, exception)`` is collected and the
+     list of those strings is returned (empty when everything merged).
+     """
+     unmerged = []
+
+     for key, data in chunk_dict.iteritems():
+         # Reset for every chunk: if path_to_xpath() itself fails, the
+         # report below must neither hit an unbound name nor show the
+         # expression left over from the previous chunk.
+         xpath = None
+         try:
+             xpath = self.path_to_xpath(key)
+             node = self.edoc.xpath(xpath)[0]
+             repl = etree.fromstring(data)
+             node.getparent().replace(node, repl)
+         except Exception, e:
+             unmerged.append(repr((key, xpath, e)))
+
+     return unmerged
\ No newline at end of file
# -*- coding: utf-8 -*-
-import os
+from librarian import dcparser, parser
+from lxml import etree
import cStringIO
-import re
import codecs
-
-from lxml import etree
-
-from librarian import dcparser
+import os
+import re
ENTITY_SUBSTITUTIONS = [
ns['wrap_words'] = wrap_words
-def transform(input_filename, output_filename, **options):
+def transform(input_filename, output_filename, is_file=True, parse_dublincore=True, **options):
"""Transforms file input_filename in XML to output_filename in TXT."""
# Parse XSLT
style_filename = os.path.join(os.path.dirname(__file__), 'book2txt.xslt')
style = etree.parse(style_filename)
if is_file:
- document = WLDocument.from_file(input, True)
+ document = parser.WLDocument.from_file(input_filename, True, parse_dublincore=parse_dublincore)
else:
- document = WLDocument.from_string(input, True)
+ document = parser.WLDocument.from_string(input_filename, True, parse_dublincore=parse_dublincore)
result = document.transform(style, **options)
output_file = codecs.open(output_filename, 'wb', encoding='utf-8')
+
+ if parse_dublincore:
+ url = dcparser.parse(input_filename).url
+ else:
+ url = '*' * 10
output_file.write(TEMPLATE % {
- 'url': dcparser.parse(input_filename).url,
+ 'url': url,
'text': unicode(result),
})
--- /dev/null
+
+<xsl:stylesheet
+ version="1.0"
+
+ xmlns="http://www.w3.org/1999/xhtml"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:wl2o="http://nowoczesnapolska.org.pl/WL/2.0/Overlay"
+ xmlns:wl="http://wolnelektury.pl/functions"
+
+ exclude-result-prefixes="wl" >
+
+ <xsl:variable name="config" select="document('config.xml')" />
+
+ <xsl:output method="xml"
+ encoding="utf-8"
+ indent="yes"
+ omit-xml-declaration = "yes" />
+
+ <xsl:strip-space elements = "strofa utwor kwestia liryka_l liryka_lp powiesc opowiadanie dramat_wierszowany_lp" />
+ <!--
+ Dokument ten opisuje podstawowe przekształcenia potrzebne
+ do zamiany dokumentu WLML 1.0 na poprawnie sformatowany
+ dokument XHTML.
+
+ -->
+
+ <xsl:template name="generic-attributes">
+ <xsl:param name="element" />
+ <xsl:param name="mypath" />
+ <xsl:variable name="tag" select="name($element)" />
+
+ <xsl:if test="$with-paths">
+ <xsl:attribute name="wl2o:path">
+ <xsl:value-of select="$mypath" />
+ </xsl:attribute>
+ </xsl:if>
+
+ <xsl:if test="$config//editable/*[name() = $tag]">
+ <xsl:attribute name="wl2o:editable">editable</xsl:attribute>
+ </xsl:if>
+
+ <xsl:attribute name="class">
+ <xsl:value-of select="$tag"/>
+ </xsl:attribute>
+ </xsl:template>
+
+ <xsl:template name="generic-descent">
+ <xsl:param name="element" />
+ <xsl:param name="mypath" />
+
+ <xsl:for-each select="child::node()">
+ <xsl:apply-templates select="." mode="element-tag">
+ <xsl:with-param name="offset" select="position()" />
+ <xsl:with-param name="parent-path" select="$mypath" />
+ </xsl:apply-templates>
+ </xsl:for-each>
+ </xsl:template>
+
+ <xsl:template name="generic-content">
+ <xsl:param name="element" />
+ <xsl:param name="mypath" />
+
+ <xsl:call-template name="generic-attributes">
+ <xsl:with-param name="element" select="$element" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+
+ <xsl:call-template name="generic-descent">
+ <xsl:with-param name="element" select="$element" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+ </xsl:template>
+
+ <!-- Generyczne szablony -->
+    <!--
+        Wraps $element in the XHTML element that config.xml assigns to its tag
+        name and processes its content in "element-content" mode.
+
+        Parameters:
+          element - the source element to convert
+          mypath  - path of $element, handed on to the content templates
+          offset  - accepted for callers that pass it; not used here
+
+        Namespaced elements and tags listed under no-show-elements produce no
+        output.  A tag that no configuration group lists terminates the
+        transformation with an error message.
+    -->
+    <xsl:template name="generic" >
+        <xsl:param name="element" />
+        <xsl:param name="mypath" />
+        <xsl:param name="offset" />
+
+        <xsl:variable name="tag" select="name($element)" />
+
+        <!-- Name of the XHTML wrapper for $tag; empty when no group lists it.
+             The order of the tests is the lookup priority. -->
+        <xsl:variable name="html-tag">
+            <xsl:choose>
+                <xsl:when test="$config//block-elements/*[local-name() = $tag]">div</xsl:when>
+                <xsl:when test="$config//paragraph-elements/*[local-name() = $tag]">p</xsl:when>
+                <xsl:when test="$config//inline-elements/*[local-name() = $tag]">span</xsl:when>
+                <xsl:when test="$config//header-1-elements/*[local-name() = $tag]">h1</xsl:when>
+                <xsl:when test="$config//header-2-elements/*[local-name() = $tag]">h2</xsl:when>
+                <xsl:when test="$config//header-3-elements/*[local-name() = $tag]">h3</xsl:when>
+                <xsl:when test="$config//header-4-elements/*[local-name() = $tag]">h4</xsl:when>
+            </xsl:choose>
+        </xsl:variable>
+
+        <xsl:choose>
+            <!-- Ignore namespaced elements.  The test is made on $element, not
+                 on the context node: this template is also called from the
+                 document root, where the context node is not $element. -->
+            <xsl:when test="namespace-uri($element)" />
+
+            <!-- string() is required: a result tree fragment is always true
+                 when converted to a boolean directly. -->
+            <xsl:when test="string($html-tag) != ''">
+                <xsl:element name="{$html-tag}" namespace="http://www.w3.org/1999/xhtml">
+                    <xsl:apply-templates select="$element" mode="element-content" >
+                        <xsl:with-param name="mypath" select="$mypath"/>
+                    </xsl:apply-templates>
+                </xsl:element>
+            </xsl:when>
+
+            <xsl:when test="$config//no-show-elements/*[local-name() = $tag]" />
+
+            <xsl:otherwise>
+                <xsl:message terminate="yes">
+ Nieznany tag '<xsl:value-of select="$tag" />' :(.
+                </xsl:message>
+            </xsl:otherwise>
+        </xsl:choose>
+    </xsl:template>
+
+
+ <!--
+ <special-tags>
+ <strofa />
+ <lista_osob />
+ <sekcja_swiatlo />
+ <sekcja_asterysk />
+ <separator_linia />
+ </special-tags>
+ -->
+
+ <xsl:template match="dlugi_cytat|poezja_cyt" mode="element-tag">
+ <xsl:param name="offset" />
+ <xsl:param name="parent-path" />
+ <xsl:variable name="mypath"
+ select="concat($parent-path, '/', name(), '[', string($offset),']')" />
+
+ <xsl:element name="blockquote" >
+ <xsl:call-template name="generic-attributes">
+ <xsl:with-param name="element" select="current()" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+ <xsl:call-template name="generic-descent">
+ <xsl:with-param name="element" select="current()" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+ </xsl:element>
+ </xsl:template>
+
+
+ <xsl:template match="lista_osob" mode="element-tag">
+ <xsl:param name="offset" />
+ <xsl:param name="parent-path" />
+ <xsl:variable name="mypath"
+ select="concat($parent-path, '/', name(), '[', string($offset),']')" />
+
+ <xsl:element name="div" >
+ <xsl:call-template name="generic-attributes">
+ <xsl:with-param name="element" select="current()" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+
+ <xsl:apply-templates select="./naglowek-listy" mode="element-tag" />
+ <ul>
+ <xsl:for-each select="./lista_osoba">
+ <xsl:apply-templates select="." mode="element-tag">
+ <xsl:with-param name="offset" select="position()" />
+ <xsl:with-param name="parent-path" select="$mypath" />
+ </xsl:apply-templates>
+ </xsl:for-each>
+ </ul>
+ </xsl:element>
+ </xsl:template>
+
+ <xsl:template match="lista_osoba" mode="element-tag">
+ <xsl:param name="offset" />
+ <xsl:param name="parent-path" />
+ <xsl:variable name="mypath"
+ select="concat($parent-path, '/', name(), '[', string($offset),']')" />
+
+ <xsl:element name="li" >
+ <xsl:call-template name="generic-attributes">
+ <xsl:with-param name="element" select="current()" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+ <xsl:call-template name="generic-descent">
+ <xsl:with-param name="element" select="current()" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+ </xsl:element>
+ </xsl:template>
+
+ <xsl:template match="separator_linia" mode="element-tag">
+ <xsl:param name="offset" />
+ <xsl:param name="parent-path" />
+ <xsl:variable name="mypath"
+ select="concat($parent-path, '/', name(), '[', string($offset),']')" />
+
+ <xsl:element name="hr" >
+ <xsl:call-template name="generic-attributes">
+ <xsl:with-param name="element" select="current()" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+ </xsl:element>
+ </xsl:template>
+
+ <xsl:template match="sekcja_swiatlo" mode="element-tag">
+ <xsl:param name="offset" />
+ <xsl:param name="parent-path" />
+ <xsl:variable name="mypath"
+ select="concat($parent-path, '/', name(), '[', string($offset),']')" />
+
+ <xsl:element name="br" >
+ <xsl:call-template name="generic-attributes">
+ <xsl:with-param name="element" select="current()" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+ </xsl:element>
+ </xsl:template>
+
+ <xsl:template match="sekcja_asterysk" mode="element-tag">
+ <xsl:param name="offset" />
+ <xsl:param name="parent-path" />
+ <xsl:variable name="mypath"
+ select="concat($parent-path, '/', name(), '[', string($offset),']')" />
+
+ <xsl:element name="p" >
+ <xsl:call-template name="generic-attributes">
+ <xsl:with-param name="element" select="current()" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+ *
+ </xsl:element>
+ </xsl:template>
+
+ <xsl:template match="zastepnik_wersu|wers_akap|wers_cd|wers_wciety" mode="element-tag">
+ <xsl:param name="offset" />
+ <xsl:param name="parent-path" />
+
+ <xsl:variable name="mypath"
+ select="concat($parent-path, '/', name(), '[',string($offset),']')" />
+
+ <xsl:call-template name="generic-descent">
+ <xsl:with-param name="element" select="current()" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+ </xsl:template>
+
+ <!-- strofy -->
+ <xsl:template match="strofa" mode="element-tag">
+ <xsl:param name="offset" />
+ <xsl:param name="parent-path" />
+
+ <xsl:variable name="mypath"
+ select="concat($parent-path, '/', name(), '[', string($offset),']')" />
+
+ <xsl:element name="div" >
+ <xsl:call-template name="generic-attributes">
+ <xsl:with-param name="element" select="current()" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+
+ <xsl:choose>
+ <xsl:when test="count(br) > 0">
+ <xsl:call-template name="verse">
+ <xsl:with-param name="verse-content" select="br[1]/preceding-sibling::text() | br[1]/preceding-sibling::node()" />
+ <xsl:with-param name="verse-type" select="br[1]/preceding-sibling::*[name() = 'wers_wciety' or name() = 'wers_akap' or name() = 'wers_cd'][1]" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+ <xsl:for-each select="br">
+ <!-- Each BR tag "consumes" text after it -->
+ <xsl:variable name="lnum" select="count(preceding-sibling::br)" />
+ <xsl:call-template name="verse">
+ <xsl:with-param name="verse-content"
+ select="following-sibling::text()[count(preceding-sibling::br) = $lnum+1] | following-sibling::node()[count(preceding-sibling::br) = $lnum+1]" />
+ <xsl:with-param name="verse-type" select="following-sibling::*[count(preceding-sibling::br) = $lnum+1 and (name() = 'wers_wciety' or name() = 'wers_akap' or name() = 'wers_cd')][1]" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+ </xsl:for-each>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:call-template name="verse">
+ <xsl:with-param name="verse-content" select="child::node()" />
+ <xsl:with-param name="verse-type" select="wers_wciety|wers_akap|wers_cd[1]" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:element>
+ </xsl:template>
+
+ <xsl:template name="verse">
+ <xsl:param name="verse-content" />
+ <xsl:param name="verse-type" />
+ <xsl:param name="mypath" />
+
+ <xsl:element name="p">
+ <xsl:attribute name="class">
+ <xsl:value-of select="name($verse-type)" />
+ </xsl:attribute>
+ <xsl:for-each select="$verse-content">
+ <xsl:apply-templates select="." mode="element-tag">
+ <xsl:with-param name="offset" select="position()" />
+ <xsl:with-param name="parent-path" select="$mypath" />
+ </xsl:apply-templates>
+ </xsl:for-each>
+ </xsl:element>
+ </xsl:template>
+
+
+<!-- default content processing -->
+ <xsl:template match="*" mode="element-content">
+ <xsl:param name="mypath" />
+ <xsl:call-template name="generic-content">
+ <xsl:with-param name="element" select="current()"/>
+ <xsl:with-param name="mypath" select="$mypath"/>
+ </xsl:call-template>
+ </xsl:template>
+
+ <xsl:template match="*" mode="element-tag" >
+ <xsl:param name="offset" />
+ <xsl:param name="parent-path" />
+
+ <xsl:variable name="mypath"
+ select="concat($parent-path, '/', name(), '[', string($offset),']')" />
+
+ <xsl:call-template name="generic">
+ <xsl:with-param name="element" select="current()" />
+ <xsl:with-param name="offset" select="$offset" />
+ <xsl:with-param name="mypath" select="$mypath" />
+ </xsl:call-template>
+ </xsl:template>
+
+ <xsl:template match="text()" mode="element-tag">
+
+ <xsl:value-of select="wl:substitute_entities(.)" />
+
+ <!--<xsl:value-of select="." /> -->
+ </xsl:template>
+
+ <xsl:template match="node()" />
+
+</xsl:stylesheet>
--- /dev/null
+
+<xsl:stylesheet version="1.0"
+ xmlns="http://www.w3.org/1999/xhtml"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+
+ <xsl:param name="with-paths" select="boolean(0)" />
+ <xsl:param name="with-annotations" select="boolean(1)" />
+
+ <xsl:include href="wl2html_base.xslt" />
+ <xsl:output encoding="utf-8" indent="yes" omit-xml-declaration = "yes" />
+
+ <!-- Entry point: emits a div.document container and converts the /utwor
+ element inside it through the "generic" template, starting at path '.'. -->
+ <xsl:template match="/">
+ <div class="document">
+
+ <!-- NOTE(review): this tests for a child element named "with-toc", not for a
+ parameter, and no "with-toc" parameter is declared in this stylesheet. The
+ body is empty, so the test currently has no effect. -->
+ <xsl:if test="with-toc" />
+
+ <xsl:call-template name="generic">
+ <xsl:with-param name="element" select="/utwor" />
+ <xsl:with-param name="mypath" select="'.'" />
+ <xsl:with-param name="offset" select="position()" />
+ </xsl:call-template>
+
+ <!-- NOTE(review): as written this is an element test as well; it does not read
+ the with-annotations parameter of this stylesheet (that would be spelled
+ $with-annotations). Confirm the intent before giving the body content. -->
+ <xsl:if test="with-annotations" />
+ </div>
+ </xsl:template>
+
+</xsl:stylesheet>
\ No newline at end of file
--- /dev/null
+<xsl:stylesheet version="1.0"
+ xmlns="http://www.w3.org/1999/xhtml"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+
+ <xsl:param name="with-paths" select="boolean(0)" />
+ <xsl:param name="base-path" select="'.'"/>
+ <xsl:param name="base-offset" select="1" />
+
+ <xsl:include href="wl2html_base.xslt" />
+ <xsl:output encoding="utf-8" indent="yes" omit-xml-declaration = "yes" />
+
+ <xsl:template match="/">
+ <xsl:message>Processing...</xsl:message>
+ <xsl:apply-templates select="/*" mode="element-tag">
+ <xsl:with-param name="offset" select="$base-offset" />
+ <xsl:with-param name="parent-path" select="$base-path" />
+ </xsl:apply-templates>
+ </xsl:template>
+
+</xsl:stylesheet>
\ No newline at end of file
parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False,
help='print status messages to stdout')
-
+ parser.add_option('-i', '--ignore-dublin-core', action='store_false', dest='parse_dublincore', default=True,
+ help='don\'t try to parse dublin core metadata')
+
options, input_filenames = parser.parse_args()
if len(input_filenames) < 1:
output_filename = os.path.splitext(input_filename)[0] + '.html'
try:
- html.transform(input_filename, output_filename)
+ html.transform(input_filename, output_filename, parse_dublincore=options.parse_dublincore)
except ParseError, e:
print '%(file)s:%(name)s:%(message)s' % {
'file': input_filename,
--- /dev/null
+#!/usr/bin/env python
+"""Convert SOURCE files to HTML using the 'partial' stylesheet.
+
+Each input file is written next to itself with an ``.html`` extension.
+Failures are reported on stdout as ``file:ExceptionName:message``.
+"""
+import os
+import optparse
+import sys
+
+from librarian import html, ParseError
+
+
+def as_bytes(message):
+    """Return message as a byte string that is safe to print.
+
+    Unicode is encoded as UTF-8; anything else (including non-string
+    exception payloads) goes through str() so reporting never fails itself.
+    """
+    if isinstance(message, unicode):
+        return message.encode('utf-8')
+    return str(message)
+
+
+def report_error(input_filename, error, message):
+    """Print one ``file:ExceptionName:message`` line for a failed input."""
+    print '%(file)s:%(name)s:%(message)s' % {
+        'file': input_filename,
+        'name': error.__class__.__name__,
+        'message': message,
+    }
+
+
+if __name__ == '__main__':
+    # Parse commandline arguments
+    usage = """Usage: %prog [options] SOURCE [SOURCE...]
+    Convert SOURCE files to HTML format."""
+
+    parser = optparse.OptionParser(usage=usage)
+
+    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False,
+        help='print status messages to stdout')
+    parser.add_option('-i', '--ignore-dublin-core', action='store_false', dest='parse_dublincore', default=True,
+        help='don\'t try to parse dublin core metadata')
+
+    options, input_filenames = parser.parse_args()
+
+    if len(input_filenames) < 1:
+        parser.print_help()
+        # sys.exit instead of the site-provided exit() helper, which is not
+        # guaranteed to exist (e.g. under "python -S").
+        sys.exit(1)
+
+    # Do some real work
+    for input_filename in input_filenames:
+        if options.verbose:
+            print input_filename
+
+        output_filename = os.path.splitext(input_filename)[0] + '.html'
+        try:
+            html.transform(input_filename, output_filename,
+                parse_dublincore=options.parse_dublincore, stylesheet='partial')
+        except ParseError, e:
+            # Parse problems are reported and the next file is processed.
+            report_error(input_filename, e, as_bytes(e.message))
+        except IOError, e:
+            report_error(input_filename, e, e.strerror)
+        except BaseException, e:
+            report_error(input_filename, e, as_bytes(e.message))
+            # Bare raise keeps the original traceback; "raise e" would
+            # restart it at this line.
+            raise
+
help='print status messages to stdout')
parser.add_option('-w', '--wrap', action='store', type='int', dest='wrapping', default=0,
help='set line wrap column')
-
+ parser.add_option('-i', '--ignore-dublin-core', action='store_false', dest='parse_dublincore', default=True,
+ help='don\'t try to parse dublin core metadata')
+
options, input_filenames = parser.parse_args()
if len(input_filenames) < 1:
output_filename = os.path.splitext(input_filename)[0] + '.txt'
try:
- text.transform(input_filename, output_filename, wrapping=str(options.wrapping))
+ text.transform(input_filename, output_filename, parse_dublincore=options.parse_dublincore,
+ wrapping=str(options.wrapping))
except ParseError, e:
print '%(file)s:%(name)s:%(message)s' % {
'file': input_filename,
--- /dev/null
+[aliases]
+test = nosetests --detailed-errors --with-doctest --with-coverage --cover-package=librarian
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-from distutils.core import setup
-from tests.utils import TestCommand
+from ez_setup import use_setuptools
+use_setuptools()
+
+from setuptools import setup, find_packages
+
setup(
name='librarian',
- version='1.2.1',
+ version='1.2.5',
description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
author='Marek Stępniowski',
author_email='marek@stepniowski.com',
url='http://redmine.nowoczesnapolska.org.pl/',
- packages=['librarian', 'tests'],
- package_dir={'librarian': 'librarian', 'tests': 'tests'},
- package_data={
- 'librarian': ['*.xslt'],
- 'tests': ['files/dcparser/*.xml', 'files/erroneous/*.xml'],
- },
+ packages=find_packages(exclude=['tests']),
+ include_package_data=True,
+ install_requires=['lxml>=2.2'],
scripts=['scripts/book2html', 'scripts/book2txt', 'scripts/bookfragments', 'scripts/genslugs'],
- cmdclass={'test': TestCommand},
+ tests_require=['nose>=0.11', 'coverage>=3.0.1'],
)
<?xml version='1.0' encoding='utf-8'?>
-<utwor><liryka_lp>
- <autor_utworu>Adam Asnyk</autor_utworu>
- <nazwa_utworu>Między nami nic nie było</nazwa_utworu>
-
- <strofa>Między nami nic nie było!/
- Żadnych zwierzeń, wyznań żadnych!/
- Nic nas z sobą nie łączyło ---/
- Prócz wiosennych marzeń zdradnych;</strofa>
-
- <strofa>Prócz tych woni, barw i blasków,/
- Unoszących się w przestrzeni;/
- Prócz szumiących śpiewem lasków/
- I tej świeżej łąk zieleni;</strofa>
-
- <strofa>Prócz tych kaskad i potoków,/
- Zraszających każdy parów,/
- Prócz girlandy tęcz, obłoków,/
- Prócz natury słodkich czarów;</strofa>
-
- <strofa>Prócz tych wspólnych, jasnych zdrojów,/
- Z których serce zachwyt piło;/
- Prócz pierwiosnków i powojów,---/
- Między nami nic nie było!</strofa>
-</liryka_lp></utwor>
+<utwor>
+ <liryka_lp>
+
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
+<rdf:Description rdf:about="http://wiki.wolnepodreczniki.pl/Lektury:Asnyk/Między_nami_nic_nie_było">
+<dc:creator xml:lang="pl">Asnyk, Adam</dc:creator>
+<dc:title xml:lang="pl">Między nami nic nie było</dc:title>
+<dc:contributor.editor xml:lang="pl">Sekuła, Aleksandra</dc:contributor.editor>
+<dc:contributor.technical_editor xml:lang="pl">Sutkowska, Olga</dc:contributor.technical_editor>
+<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher>
+<dc:subject.period xml:lang="pl">Pozytywizm</dc:subject.period>
+<dc:subject.type xml:lang="pl">Liryka</dc:subject.type>
+<dc:subject.genre xml:lang="pl">Wiersz</dc:subject.genre>
+<dc:description xml:lang="pl">Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN.</dc:description>
+<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/katalog/lektura/miedzy-nami-nic-nie-bylo</dc:identifier.url>
+<dc:source.URL xml:lang="pl">http://www.polona.pl/Content/5164</dc:source.URL>
+<dc:source xml:lang="pl">(Asnyk, Adam) El...y (1838-1897), Poezye, t. 3, Gebethner i Wolff, wyd. nowe poprzedzone słowem wstępnym St. Krzemińskiego, Warszawa, 1898</dc:source>
+<dc:rights xml:lang="pl">Domena publiczna - Adam Asnyk zm. 1897</dc:rights>
+<dc:date.pd xml:lang="pl">1897</dc:date.pd>
+<dc:format xml:lang="pl">xml</dc:format>
+<dc:type xml:lang="pl">text</dc:type>
+<dc:type xml:lang="en">text</dc:type>
+<dc:date xml:lang="pl">2007-09-06</dc:date>
+<dc:audience xml:lang="pl">L</dc:audience>
+<dc:language xml:lang="pl">pol</dc:language>
+</rdf:Description>
+</rdf:RDF>
+
+
+<autor_utworu>Adam Asnyk</autor_utworu>
+
+<nazwa_utworu><begin id="b1189062500041"/><motyw id="m1189062500041">Miłość platoniczna</motyw>Między nami nic nie było</nazwa_utworu>
+
+
+
+<strofa>Między nami nic nie było!/
+Żadnych zwierzeń, wyznań żadnych!/
+Nic nas z sobą nie łączyło ---/
+Prócz wiosennych marzeń zdradnych;</strofa>
+
+
+
+<strofa><begin id="b1189062528872"/><motyw id="m1189062528872">Natura</motyw>Prócz tych woni, barw i blasków,/
+Unoszących się w przestrzeni;/
+Prócz szumiących śpiewem lasków/
+I tej świeżej łąk zieleni;</strofa>
+
+
+
+<strofa>Prócz tych kaskad i potoków,/
+Zraszających każdy parów,/
+Prócz girlandy tęcz, obłoków,/
+Prócz natury słodkich czarów;</strofa>
+
+
+
+<strofa>Prócz tych wspólnych, jasnych zdrojów,/
+Z których serce zachwyt piło;/
+Prócz pierwiosnków i powojów,---/
+Między nami nic nie było!<end id="e1189062528872"/><end id="e1189062500041"/></strofa>
+
+</liryka_lp>
+</utwor>
--- /dev/null
+<div xmlns:wl="http://wolnelektury.pl/functions" id="book-text">
+ <div id="toc">
+ <h2>Spis treści</h2>
+ <ol/>
+ </div>
+ <h1>
+ <span class="author">Adam Asnyk</span>
+ <span class="title"><a name="m1189062500041" class="theme-begin" fid="1189062500041">Miłość platoniczna</a>Między nami nic nie było</span>
+ </h1>
+ <a name="m1189062500041" class="theme-begin" fid="1189062500041">Miłość platoniczna</a>
+ <div class="stanza">
+ <p class="verse"><a name="f1" class="target"> </a><a href="#f1" class="anchor">1</a>Między nami nic nie było!</p>
+ <p class="verse">
+Żadnych zwierzeń, wyznań żadnych!</p>
+ <p class="verse">
+Nic nas z sobą nie łączyło —</p>
+ <p class="verse">
+Prócz wiosennych marzeń zdradnych;</p>
+ </div>
+ <div class="stanza">
+ <p class="verse"><a name="f5" class="target"> </a><a href="#f5" class="anchor">5</a><a name="m1189062528872" class="theme-begin" fid="1189062528872">Natura</a>Prócz tych woni, barw i blasków,</p>
+ <p class="verse">
+Unoszących się w przestrzeni;</p>
+ <p class="verse">
+Prócz szumiących śpiewem lasków</p>
+ <p class="verse">
+I tej świeżej łąk zieleni;</p>
+ </div>
+ <div class="stanza">
+ <p class="verse">Prócz tych kaskad i potoków,</p>
+ <p class="verse"><a name="f10" class="target"> </a><a href="#f10" class="anchor">10</a>
+Zraszających każdy parów,</p>
+ <p class="verse">
+Prócz girlandy tęcz, obłoków,</p>
+ <p class="verse">
+Prócz natury słodkich czarów;</p>
+ </div>
+ <div class="stanza">
+ <p class="verse">Prócz tych wspólnych, jasnych zdrojów,</p>
+ <p class="verse">
+Z których serce zachwyt piło;</p>
+ <p class="verse"><a name="f15" class="target"> </a><a href="#f15" class="anchor">15</a>
+Prócz pierwiosnków i powojów,—</p>
+ <p class="verse">
+Między nami nic nie było!<span class="theme-end" fid="1189062528872"/><span class="theme-end" fid="1189062500041"/></p>
+ </div>
+</div>
--- /dev/null
+Kodowanie znaków w dokumencie: UTF-8.
+-----
+Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl/). Reprodukcja cyfrowa wykonana przez
+Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. Ten utwór nie jest chroniony prawem autorskim i znajduje
+się w domenie publicznej, co oznacza, że możesz go swobodnie wykorzystywać, publikować i rozpowszechniać.
+
+Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dostępna jest na stronie http://wolnelektury.pl/katalog/lektura/miedzy-nami-nic-nie-bylo.
+-----
+
+
+
+
+
+Adam Asnyk
+
+Między nami nic nie było
+
+
+
+Między nami nic nie było!
+Żadnych zwierzeń, wyznań żadnych!
+Nic nas z sobą nie łączyło —
+Prócz wiosennych marzeń zdradnych;
+
+Prócz tych woni, barw i blasków,
+Unoszących się w przestrzeni;
+Prócz szumiących śpiewem lasków
+I tej świeżej łąk zieleni;
+
+Prócz tych kaskad i potoków,
+Zraszających każdy parów,
+Prócz girlandy tęcz, obłoków,
+Prócz natury słodkich czarów;
+
+Prócz tych wspólnych, jasnych zdrojów,
+Z których serce zachwyt piło;
+Prócz pierwiosnków i powojów,—
+Między nami nic nie było!
--- /dev/null
+<?xml version='1.0' encoding='utf-8'?>
+<utwor><liryka_lp>
+ <autor_utworu>Adam Asnyk</autor_utworu>
+ <nazwa_utworu>Między nami nic nie było</nazwa_utworu>
+
+ <strofa>Między nami nic nie było!/
+ Żadnych zwierzeń, wyznań żadnych!/
+ Nic nas z sobą nie łączyło ---/
+ Prócz wiosennych marzeń zdradnych;</strofa>
+
+ <strofa>Prócz tych woni, barw i blasków,/
+ Unoszących się w przestrzeni;/
+ Prócz szumiących śpiewem lasków/
+ I tej świeżej łąk zieleni;</strofa>
+
+ <strofa>Prócz tych kaskad i potoków,/
+ Zraszających każdy parów,/
+ Prócz girlandy tęcz, obłoków,/
+ Prócz natury słodkich czarów;</strofa>
+
+ <strofa>Prócz tych wspólnych, jasnych zdrojów,/
+ Z których serce zachwyt piło;/
+ Prócz pierwiosnków i powojów,---/
+ Między nami nic nie było!</strofa>
+</liryka_lp></utwor>
-#!/usr/bin/env python
# -*- coding: utf-8 -*-
-
-import unittest
-
+from librarian import dcparser
from lxml import etree
-from utils import get_file_path
-from librarian import dcparser, html, ParseError
-from utils import AutoTestMetaclass
-
-class TestDCParser(unittest.TestCase):
- __metaclass__ = AutoTestMetaclass
+from nose.tools import *
+from os.path import splitext
+from tests.utils import get_all_fixtures
+import codecs
- TEST_DIR = 'dcparser'
- def run_auto_test(self, in_data, out_data):
- info = dcparser.BookInfo.from_string(in_data).to_dict()
- should_be = eval(out_data)
- for key in should_be:
- self.assertEqual( info[key], should_be[key] )
+# Parses the Dublin Core data of xml_file and compares it with the expected
+# values obtained by evaluating the contents of result_file.
+def check_dcparser(xml_file, result_file):
+ xml = file(xml_file).read()
+ result = codecs.open(result_file, encoding='utf-8').read()
+ info = dcparser.BookInfo.from_string(xml).to_dict()
+ # NOTE(review): eval() executes whatever result_file contains, so this helper
+ # must only ever be pointed at trusted fixture files.
+ should_be = eval(result)
+ # Only keys present in the expected data are checked; additional keys in
+ # info are not compared.
+ for key in should_be:
+ assert_equals(info[key], should_be[key])
-class TestDCSerialize(unittest.TestCase):
- __metaclass__ = AutoTestMetaclass
- TEST_DIR = 'dcserialize'
+def test_dcparser():
+ for fixture in get_all_fixtures('dcparser', '*.xml'):
+ base_name = splitext(fixture)[0]
+ yield check_dcparser, fixture, base_name + '.out'
- def run_auto_test(self, in_data, out_data):
- import lxml.etree
- # first parse the input
- info = dcparser.BookInfo.from_string(in_data)
- # serialize
- serialized = lxml.etree.tostring(info.to_etree(), encoding=unicode).encode('utf-8')
+def check_serialize(xml_file):
+ xml = file(xml_file).read()
+ info = dcparser.BookInfo.from_string(xml)
- # then parse again
- info_bis = dcparser.BookInfo.from_string(serialized)
+ # serialize
+ serialized = etree.tostring(info.to_etree(), encoding=unicode).encode('utf-8')
+ # then parse again
+ info_bis = dcparser.BookInfo.from_string(serialized)
- # check if they are the same
- for key in vars(info):
- self.assertEqual( getattr(info, key), getattr(info_bis, key))
+ # check if they are the same
+ for key in vars(info):
+ assert_equals(getattr(info, key), getattr(info_bis, key))
+ for key in vars(info_bis):
+ assert_equals(getattr(info, key), getattr(info_bis, key))
- for key in vars(info_bis):
- self.assertEqual( getattr(info, key), getattr(info_bis, key))
-class TestParserErrors(unittest.TestCase):
- def test_error(self):
- try:
- html.transform(get_file_path('erroneous', 'asnyk_miedzy_nami.xml'),
- get_file_path('erroneous', 'asnyk_miedzy_nami.html'))
- self.fail()
- except ParseError:
- pass
- #self.assertEqual(e.position, (25, 13))
+def test_serialize():
+ for fixture in get_all_fixtures('dcparser', '*.xml'):
+ yield check_serialize, fixture
-if __name__ == '__main__':
- unittest.main()
--- /dev/null
+# -*- coding: utf-8 -*-
+from librarian import html, NoDublinCore
+from nose.tools import *
+from utils import get_fixture, remove_output_file
+
+
+def teardown_transform():
+ remove_output_file('text', 'asnyk_miedzy_nami.html')
+
+
+@with_setup(None, teardown_transform)
+def test_transform():
+ output_file_path = get_fixture('text', 'asnyk_miedzy_nami.html')
+ expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.html')
+
+ html.transform(
+ get_fixture('text', 'asnyk_miedzy_nami.xml'),
+ output_file_path,
+ )
+
+ assert_equal(file(output_file_path).read(), file(expected_output_file_path).read())
+
+
+@with_setup(None, teardown_transform)
+@raises(NoDublinCore)
+def test_no_dublincore():
+ html.transform(
+ get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
+ get_fixture('text', 'asnyk_miedzy_nami_nodc.html'),
+ )
+
+
+@with_setup(None, teardown_transform)
+def test_passing_parse_dublincore_to_transform():
+ """Passing parse_dublincore=False to transform omits DublinCore parsing."""
+ html.transform(
+ get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
+ get_fixture('text', 'asnyk_miedzy_nami.html'),
+ parse_dublincore=False,
+ )
-#!/usr/bin/env python
-# encoding: utf-8
+# -*- coding: utf-8 -*-
+from librarian import text, NoDublinCore
+from nose.tools import *
+from utils import get_fixture, remove_output_file
-import unittest
-from utils import get_file_path
-from librarian import dcparser
-from librarian import text, NoDublinCore
+def teardown_transform():
+ remove_output_file('text', 'asnyk_miedzy_nami.txt')
+
+
+@with_setup(None, teardown_transform)
+def test_transform():
+ output_file_path = get_fixture('text', 'asnyk_miedzy_nami.txt')
+ expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.txt')
+
+ text.transform(
+ get_fixture('text', 'asnyk_miedzy_nami.xml'),
+ output_file_path,
+ )
+
+ assert_equal(file(output_file_path).read(), file(expected_output_file_path).read())
-class TestXML(unittest.TestCase):
- def test_no_dublincore(self):
- try:
- text.transform(get_file_path('text', 'asnyk_miedzy_nami.xml'),
- get_file_path('text', 'asnyk_miedzy_nami.txt'))
- self.fail()
- except NoDublinCore, e:
- pass
+@with_setup(None, teardown_transform)
+@raises(NoDublinCore)
+def test_no_dublincore():
+ text.transform(
+ get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
+ get_fixture('text', 'asnyk_miedzy_nami_nodc.txt'),
+ )
-if __name__ == '__main__':
- unittest.main()
+@with_setup(None, teardown_transform)
+def test_passing_parse_dublincore_to_transform():
+ """Passing parse_dublincore=False to transform omits DublinCore parsing."""
+ text.transform(
+ get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'),
+ get_fixture('text', 'asnyk_miedzy_nami.txt'),
+ parse_dublincore=False,
+ )
+
\ No newline at end of file
-from __future__ import with_statement
-
+from os.path import realpath, join, dirname
+import glob
import os
-from distutils.core import Command
-from unittest import TextTestRunner, TestLoader
-from glob import glob
-from os.path import dirname, join, realpath, splitext, basename, walk
-from os import listdir
-import codecs
-
-class AutoTestMetaclass(type):
-
- def __new__(cls, name, bases, class_dict):
- test_dir = class_dict.pop('TEST_DIR')
- path = realpath( join(dirname(__file__), 'files', test_dir) )
- for file in listdir(path):
- base, ext = splitext(file)
- if ext != '.xml':
- continue
- class_dict['test_'+base] = cls.make_test_runner(base, \
- join(path, base +'.xml'), join(path, base + '.out') )
+def get_fixture_dir(dir_name):
+    """Return the resolved path of the ``files/<dir_name>`` directory beside this module."""
+    module_dir = dirname(__file__)
+    fixture_dir = join(module_dir, 'files', dir_name)
+    return realpath(fixture_dir)
- return type.__new__(cls, name, bases, class_dict)
-
- @staticmethod
- def make_test_runner(name, inputf, outputf):
- def runner(self):
- with open(inputf, 'rb') as ifd:
- with codecs.open(outputf, 'rb', encoding='utf-8') as ofd:
- self.run_auto_test(ifd.read(), ofd.read())
- return runner
+def get_fixture(dir_name, file_name):
+    """Return the path of fixture file_name inside the fixtures directory dir_name."""
+    fixture_dir = get_fixture_dir(dir_name)
+    return join(fixture_dir, file_name)
-def get_file_path(dir_name, file_name):
- return realpath(join(dirname(__file__), 'files', dir_name, file_name))
-class TestCommand(Command):
- user_options = []
+def get_all_fixtures(dir_name, glob_pattern='*'):
+    """Return a sorted list of fixture paths in directory dir_name matching glob_pattern.
+
+    glob_pattern is a shell-style pattern interpreted by glob.glob relative to
+    the fixtures directory; it defaults to every entry in that directory.
+    """
+    pattern = join(get_fixture_dir(dir_name), glob_pattern)
+    # glob.glob already yields paths rooted at the fixtures directory, so they
+    # need no further joining; sorting makes the order independent of the
+    # filesystem's directory listing order.
+    return sorted(glob.glob(pattern))
- def initialize_options(self):
- self._dir = os.getcwd()
- def finalize_options(self):
+def remove_output_file(dir_name, file_name):
+    """Delete fixture file_name from fixtures directory dir_name, best effort.
+
+    OS-level failures (for example, the file was never created) are ignored.
+    """
+    try:
+        os.remove(get_fixture(dir_name, file_name))
+    except OSError:
+        # Only filesystem errors are expected here; a bare except would also
+        # hide KeyboardInterrupt, SystemExit and programming errors.
         pass
-
- def run(self):
- '''
- Finds all the tests modules in tests/, and runs them.
- '''
- testfiles = []
- for t in glob(join(self._dir, 'tests', '*.py')):
- module_name = splitext(basename(t))[0]
- if module_name.startswith('test'):
- testfiles.append('.'.join(['tests', module_name])
- )
-
- tests = TestLoader().loadTestsFromNames(testfiles)
- t = TextTestRunner(verbosity=2)
- t.run(tests)
-