.DS_Store
*.pyc
+*~
MANIFEST
dist
build
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
class ParseError(Exception):
pass
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from xml.parsers.expat import ExpatError
from datetime import date
def __init__(self, last_name, *first_names):
self.last_name = last_name
self.first_names = first_names
-
+
@classmethod
def from_text(cls, text):
parts = [ token.strip() for token in text.split(',') ]
raise ValueError("Found a comma, but no names given: \"%s\" -> %r." % (text, parts))
names = [ name for name in parts[1].split() if len(name) ] # all non-whitespace tokens
return cls(surname, *names)
-
+
def __eq__(self, right):
return self.last_name == right.last_name and self.first_names == right.first_names
-
-
+
+
def __unicode__(self):
if len(self.first_names) > 0:
return '%s, %s' % (self.last_name, ' '.join(self.first_names))
else:
return self.last_name
-
+
def __repr__(self):
return 'Person(last_name=%r, first_names=*%r)' % (self.last_name, self.first_names)
-class BookInfo(object):
+class BookInfo(object):
FIELDS = (
Field( DCNS('creator'), 'author', as_person),
Field( DCNS('title'), 'title'),
Field( DCNS('rights.license'), 'license', required=False),
Field( DCNS('rights'), 'license_description'),
)
-
+
@property
def slug(self):
return self.url.rsplit('/', 1)[1]
def from_string(cls, xml):
from StringIO import StringIO
return cls.from_file(StringIO(xml))
-
+
@classmethod
def from_file(cls, xmlfile):
- desc_tag = None
+ desc_tag = None
try:
- iter = etree.iterparse(xmlfile, ['start', 'end'])
+ iter = etree.iterparse(xmlfile, ['start', 'end'])
for (event, element) in iter:
if element.tag == RDFNS('RDF') and event == 'start':
desc_tag = element
break
# if there is no end, Expat should yell at us with an ExpatError
-
+
# extract data from the element and make the info
return cls.from_element(desc_tag)
except XMLSyntaxError, e:
# the tree is already parsed, so we don't need to worry about Expat errors
field_dict = {}
desc = rdf_tag.find(".//" + RDFNS('Description') )
-
+
if desc is None:
raise NoDublinCore("No DublinCore section found.")
fv = field_dict.get(e.tag, [])
fv.append(e.text)
field_dict[e.tag] = fv
-
+
return cls( desc.attrib, field_dict )
def __init__(self, rdf_attrs, dc_fields):
"""rdf_attrs should be a dictionary-like object with any attributes of the RDF:Description.
- dc_fields - dictionary mapping DC fields (with namespace) to list of text values for the
+ dc_fields - dictionary mapping DC fields (with namespace) to list of text values for the
given field. """
self.about = rdf_attrs.get(RDFNS('about'))
else: # singular alias
if not field.multiple:
raise "OUCH!! for field %s" % name
-
+
return value[0]
except (KeyError, AttributeError):
return object.__getattribute__(self, name)
return object.__setattr__(self, name, newvalue)
def update(self, field_dict):
- """Update using field_dict. Verify correctness, but don't check if all
+ """Update using field_dict. Verify correctness, but don't check if all
required fields are present."""
for field in self.FIELDS:
if field_dict.has_key(field.name):
"""XML representation of this object."""
#etree._namespace_map[str(self.RDF)] = 'rdf'
#etree._namespace_map[str(self.DC)] = 'dc'
-
+
if parent is None:
root = etree.Element(RDFNS('RDF'))
else:
root = parent.makeelement(RDFNS('RDF'))
description = etree.SubElement(root, RDFNS('Description'))
-
+
if self.about:
description.set(RDFNS('about'), self.about)
-
+
for field in self.FIELDS:
v = getattr(self, field.name, None)
if v is not None:
e = etree.Element(field.uri)
e.text = unicode(v)
description.append(e)
-
+
return root
v = [ unicode(x) for x in v if v is not None ]
else:
v = unicode(v)
-
+
dc[field.name] = {'uri': field.uri, 'value': v}
rdf['fields'] = dc
return rdf
if field.salias:
v = getattr(self, field.salias)
if v is not None: result[field.salias] = unicode(v)
-
+
return result
def parse(file_name):
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
import os
import cStringIO
def transform(input, output_filename=None, is_file=True, \
parse_dublincore=True, stylesheet='legacy', options={}):
"""Transforms file input_filename in XML to output_filename in XHTML.
-
+
If output_filename is None, returns an XML,
otherwise returns True if file has been written,False if it hasn't.
File won't be written if it has no content.
parse_dublincore=parse_dublincore)
result = document.transform(style, **options)
- del document # no longer needed large object :)
-
+ del document # no longer needed large object :)
+
if html_has_content(result):
add_anchors(result.getroot())
add_table_of_contents(result.getroot())
-
+
if output_filename is not None:
result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8')
else:
anchor.tail = element.text
element.text = u''
element.insert(0, anchor)
-
+
if with_target:
anchor_target = etree.Element('a', name='%s' % prefix)
anchor_target.set('class', 'target')
if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
or e.tag == 'blockquote'):
continue
-
+
if element.tag == 'p' and 'verse' in element.get('class', ''):
if counter == 1 or counter % 5 == 0:
add_anchor(element, "f%d" % counter, link_text=counter)
if element.tag in ('h2', 'h3'):
if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
continue
-
+
if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), []))
else:
sections.append((counter, element.tag, ''.join(element.xpath('text()')), []))
add_anchor(element, "s%d" % counter, with_link=False)
counter += 1
-
+
toc = etree.Element('div')
toc.set('id', 'toc')
toc_header = etree.SubElement(toc, 'h2')
for n, section, text, subsections in sections:
section_element = etree.SubElement(toc_list, 'li')
add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
-
+
if len(subsections):
subsection_list = etree.SubElement(section_element, 'ol')
for n, subsection, text, _ in subsections:
subsection_element = etree.SubElement(subsection_list, 'li')
add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
-
+
root.insert(0, toc)
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from librarian import ValidationError, NoDublinCore, ParseError
from librarian import RDFNS
self.edoc = edoc
root_elem = edoc.getroot()
-
+
dc_path = './/' + RDFNS('RDF')
-
+
if root_elem.tag != 'utwor':
raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
if self.rdf_elem is None:
raise NoDublinCore('Document has no DublinCore - which is required.')
-
+
self.book_info = dcparser.BookInfo.from_element(self.rdf_elem)
else:
self.book_info = None
-
+
@classmethod
def from_string(cls, xml, swap_endlines=False, parse_dublincore=True):
return cls.from_file(StringIO(xml), swap_endlines, parse_dublincore=parse_dublincore)
if swap_endlines:
data = cls.LINE_SWAP_EXPR.sub(u'<br />\n', data)
-
+
try:
parser = etree.XMLParser(remove_blank_text=False)
return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore)
except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
- raise ParseError(e)
+ raise ParseError(e)
def chunk(self, path):
- # convert the path to XPath
+ # convert the path to XPath
expr = self.path_to_xpath(path)
elems = self.edoc.xpath(expr)
if len(elems) == 0:
return None
- else:
+ else:
return elems[0]
def path_to_xpath(self, path):
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from librarian import dcparser, parser
from lxml import etree
text = ''.join(text)
if not wrapping:
return text
-
+
words = re.split(r'\s', text)
-
+
line_length = 0
lines = [[]]
for word in words:
result = document.transform(style, **options)
output_file = codecs.open(output_filename, 'wb', encoding='utf-8')
-
+
if parse_dublincore:
url = dcparser.parse(input_filename).url
else:
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from __future__ import with_statement
if m is not None:
return m.group(1)
else:
- return text
-
+ return text
+
def filter_verse_ends(data):
return data.replace('/\n', '<br />')
def normalize_stylesheet():
return etree.XSLT(etree.parse(os.path.join(os.path.dirname(librarian.__file__), 'xslt', 'normalize.xslt')))
-if __name__ == '__main__':
+if __name__ == '__main__':
tran = normalize_stylesheet()
input = StringIO( f )
doc = trans( etree.parse(input) )
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from distutils.core import setup
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from librarian import dcparser
from lxml import etree
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from librarian import html, NoDublinCore
from nose.tools import *
def test_transform():
output_file_path = get_fixture('text', 'asnyk_miedzy_nami.html')
expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.html')
-
+
html.transform(
get_fixture('text', 'asnyk_miedzy_nami.xml'),
output_file_path,
)
-
+
assert_equal(file(output_file_path).read(), file(expected_output_file_path).read())
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from librarian import text, NoDublinCore
from nose.tools import *
def test_transform():
output_file_path = get_fixture('text', 'asnyk_miedzy_nami.txt')
expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.txt')
-
+
text.transform(
get_fixture('text', 'asnyk_miedzy_nami.xml'),
output_file_path,
)
-
+
assert_equal(file(output_file_path).read(), file(expected_output_file_path).read())
get_fixture('text', 'asnyk_miedzy_nami.txt'),
parse_dublincore=False,
)
-
\ No newline at end of file
# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from __future__ import with_statement
from os.path import realpath, join, dirname