From: Marek Stępniowski Date: Fri, 19 Mar 2010 15:59:37 +0000 (+0100) Subject: Extracted from project "Wolnelektury.pl". Version 1.1 X-Git-Tag: 1.7~300 X-Git-Url: https://git.mdrn.pl/librarian.git/commitdiff_plain/07fdba2c7fe8e11b6867712d47bdd608e88c29fb?hp=eaa6a2272807a53277a845f127061a1c229ef58e Extracted from project "Wolnelektury.pl". Version 1.1 --- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7189e7b --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.DS_Store +*.pyc +MANIFEST +dist diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..4c76fc3 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include librarian/*.xslt +recursive-include librarian/tests/files/ *.xml diff --git a/librarian/__init__.py b/librarian/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/librarian/__init__.pyc b/librarian/__init__.pyc new file mode 100644 index 0000000..3d4eb13 Binary files /dev/null and b/librarian/__init__.pyc differ diff --git a/librarian/book2html.xslt b/librarian/book2html.xslt new file mode 100644 index 0000000..71f1182 --- /dev/null +++ b/librarian/book2html.xslt @@ -0,0 +1,615 @@ + + + + + + + +
+ + +
+

Przypisy

+ +
+ + [] + + +

+
+ + + +
+
+
+
+
+
+ +
+ + + + + + + + +

+ +

+
+ +
+ + + + + + + +
+
+ + +
+

+
    + +
+
+
+ + +
+
+ + +
+ +
+
+ + +
+
+ + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + +

+
+ + +

+
+ + +

+
+ + + +

+
+ + +
+
+ + +
  • +
    + + +

    +
    + + +
    + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +

    + + + padding-left: 1em + + + + + padding-left: em + + + padding-left: 1em + + + + + padding-left: 12em + + + +

    +
    + + +

    +
    + + + + + + + + + + [] + + + + + + + + + + + + + + + + + + „” + + + + + + + + + + + + + + + + + +
    +
    + + +

    *

    +
    + + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + diff --git a/librarian/book2txt.xslt b/librarian/book2txt.xslt new file mode 100644 index 0000000..cd98524 --- /dev/null +++ b/librarian/book2txt.xslt @@ -0,0 +1,321 @@ + + + + + + + + + +Kodowanie znaków w dokumencie: UTF-8. +----- +Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl/). Reprodukcja cyfrowa wykonana przez +Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. Ten utwór nie jest chroniony prawem autorskim i znajduje +się w domenie publicznej, co oznacza, że możesz go swobodnie wykorzystywać, publikować i rozpowszechniać. + +Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dostępna jest na stronie %s. +----- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +/ / + + + + + * + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +„” + + + +** + + + + + + + + + + + + + + + + + + + + + + +* + + + + + + + +------------------------------------------------ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/librarian/dcparser.py b/librarian/dcparser.py new file mode 100644 index 0000000..557509c --- /dev/null +++ b/librarian/dcparser.py @@ -0,0 +1,197 @@ +# -*- coding: utf-8 -*- +from xml.parsers.expat import ExpatError +from datetime import date +import time + +# Import ElementTree from anywhere +try: + import xml.etree.ElementTree as etree # Python >= 2.5 +except ImportError: + try: + import elementtree.ElementTree as etree # effbot's pure Python module + except ImportError: + import lxml.etree as etree # ElementTree API using libxml2 + + +# ============== +# = Converters = +# ============== +class Person(object): + """Single person with last name and a list of first names.""" + def __init__(self, last_name, *first_names): + self.last_name = last_name + self.first_names = first_names + + + def __eq__(self, right): + return self.last_name == right.last_name and self.first_names == right.first_names + + + def __unicode__(self): + if len(self.first_names) > 0: + return '%s, %s' % (self.last_name, ' '.join(self.first_names)) + else: + return self.last_name + + + def __repr__(self): + return 'Person(last_name=%r, first_names=*%r)' % (self.last_name, self.first_names) + + +def str_to_unicode(value, previous): + return unicode(value) + + +def str_to_unicode_list(value, previous): + if previous is None: + previous = [] + previous.append(str_to_unicode(value, None)) + return previous + + +def str_to_person(value, previous): + comma_count = value.count(',') + + if comma_count == 0: + last_name, first_names = value, [] + elif comma_count == 1: + last_name, first_names = value.split(',') + first_names = [name for name in first_names.split(' ') if len(name)] + else: + raise ValueError("value contains more than one comma: %r" % value) + + return Person(last_name.strip(), *first_names) + + +def str_to_date(value, previous): + try: + t = time.strptime(value, '%Y-%m-%d') + except ValueError: + t = time.strptime(value, '%Y') + return date(t[0], t[1], t[2]) + + +# ========== +# = Parser = +# ========== +class ParseError(Exception): + def __init__(self, message): + super(ParseError, self).__init__(message) + + +class XMLNamespace(object): + '''Represents XML namespace.''' + + def __init__(self, uri): + self.uri = uri + + def __call__(self, tag): + return '{%s}%s' % (self.uri, tag) + + def __contains__(self, tag): + return tag.startswith(str(self)) + + def __repr__(self): + return 'XMLNamespace(%r)' % self.uri + + def __str__(self): + return '%s' % self.uri + + +class BookInfo(object): + RDF = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#') + DC = XMLNamespace('http://purl.org/dc/elements/1.1/') + + mapping = { + DC('creator') : ('author', str_to_person), + DC('title') : ('title', str_to_unicode), + DC('subject.period') : ('epoch', str_to_unicode), + DC('subject.type') : ('kind', str_to_unicode), + DC('subject.genre') : ('genre', str_to_unicode), + DC('date') : ('created_at', str_to_date), + DC('date.pd') : ('released_to_public_domain_at', str_to_date), + DC('contributor.translator') : ('translator', str_to_person), + DC('contributor.technical_editor') : ('technical_editor', str_to_person), + DC('publisher') : ('publisher', str_to_unicode), + DC('source') : ('source_name', str_to_unicode), + DC('source.URL') : ('source_url', str_to_unicode), + DC('identifier.url') : ('url', str_to_unicode), + DC('relation.hasPart') : ('parts', str_to_unicode_list), + DC('rights.license') : ('license', str_to_unicode), + DC('rights') : ('license_description', str_to_unicode), + } + + @classmethod + def from_string(cls, xml): + from StringIO import StringIO + return cls.from_file(StringIO(xml)) + + @classmethod + def from_file(cls, xml_file): + book_info = cls() + + try: + tree = etree.parse(xml_file) + except ExpatError, e: + raise ParseError(e) + + description = tree.find('//' + book_info.RDF('Description')) + book_info.wiki_url = description.get(cls.RDF('about'), None) + + if description is None: + raise ParseError('no Description tag found in document') + + for element in description.findall('*'): + book_info.parse_element(element) + + return book_info + + def parse_element(self, element): + try: + attribute, converter = self.mapping[element.tag] + setattr(self, attribute, converter(element.text, getattr(self, attribute, None))) + except KeyError: + pass + + def to_xml(self): + """XML representation of this object.""" + etree._namespace_map[str(self.RDF)] = 'rdf' + etree._namespace_map[str(self.DC)] = 'dc' + + root = etree.Element(self.RDF('RDF')) + description = etree.SubElement(root, self.RDF('Description')) + + if self.wiki_url: + description.set(self.RDF('about'), self.wiki_url) + + for tag, (attribute, converter) in self.mapping.iteritems(): + if hasattr(self, attribute): + e = etree.Element(tag) + e.text = unicode(getattr(self, attribute)) + description.append(e) + + return unicode(etree.tostring(root, 'utf-8'), 'utf-8') + + def to_dict(self): + etree._namespace_map[str(self.RDF)] = 'rdf' + etree._namespace_map[str(self.DC)] = 'dc' + + result = {'about': self.wiki_url} + for tag, (attribute, converter) in self.mapping.iteritems(): + if hasattr(self, attribute): + result[attribute] = unicode(getattr(self, attribute)) + + return result + + +def parse(file_name): + return BookInfo.from_file(file_name) + + +if __name__ == '__main__': + import sys + + info = parse(sys.argv[1]) + for attribute, _ in BookInfo.mapping.values(): + print '%s: %r' % (attribute, getattr(info, attribute, None)) + diff --git a/librarian/dcparser.pyc b/librarian/dcparser.pyc new file mode 100644 index 0000000..0e911b8 Binary files /dev/null and b/librarian/dcparser.pyc differ diff --git a/librarian/html.py b/librarian/html.py new file mode 100644 index 0000000..b279e5d --- /dev/null +++ b/librarian/html.py @@ -0,0 +1,242 @@ +# -*- coding: utf-8 -*- +import os +import cStringIO +import re +import copy +import pkgutil + +from lxml import etree + + +ENTITY_SUBSTITUTIONS = [ + (u'---', u'—'), + (u'--', u'–'), + (u'...', u'…'), + (u',,', u'„'), + (u'"', u'”'), +] + + +def substitute_entities(context, text): + """XPath extension function converting all entites in passed text.""" + if isinstance(text, list): + text = ''.join(text) + for entity, substitutution in ENTITY_SUBSTITUTIONS: + text = text.replace(entity, substitutution) + return text + + +# Register substitute_entities function with lxml +ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') +ns['substitute_entities'] = substitute_entities + + +def transform(input_filename, output_filename): + """Transforms file input_filename in XML to output_filename in XHTML.""" + # Parse XSLT + style_filename = os.path.join(os.path.dirname(__file__), 'book2html.xslt') + style = etree.parse(style_filename) + + doc_file = cStringIO.StringIO() + expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE); + + f = open(input_filename, 'r') + for line in f: + line = line.decode('utf-8') + line = expr.sub(u'
    \n', line) + doc_file.write(line.encode('utf-8')) + f.close() + + doc_file.seek(0); + + parser = etree.XMLParser(remove_blank_text=True) + doc = etree.parse(doc_file, parser) + + result = doc.xslt(style) + if result.find('//p') is not None: + add_anchors(result.getroot()) + add_table_of_contents(result.getroot()) + result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8') + return True + else: + return False + + +class Fragment(object): + def __init__(self, id, themes): + super(Fragment, self).__init__() + self.id = id + self.themes = themes + self.events = [] + + def append(self, event, element): + self.events.append((event, element)) + + def closed_events(self): + stack = [] + for event, element in self.events: + if event == 'start': + stack.append(('end', element)) + elif event == 'end': + try: + stack.pop() + except IndexError: + print 'CLOSED NON-OPEN TAG:', element + + stack.reverse() + return self.events + stack + + def to_string(self): + result = [] + for event, element in self.closed_events(): + if event == 'start': + result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items()))) + if element.text: + result.append(element.text) + elif event == 'end': + result.append(u'' % element.tag) + if element.tail: + result.append(element.tail) + else: + result.append(element) + + return ''.join(result) + + def __unicode__(self): + return self.to_string() + + +def extract_fragments(input_filename): + """Extracts theme fragments from input_filename.""" + open_fragments = {} + closed_fragments = {} + + for event, element in etree.iterparse(input_filename, events=('start', 'end')): + # Process begin and end elements + if element.get('class', '') in ('theme-begin', 'theme-end'): + if not event == 'end': continue # Process elements only once, on end event + + # Open new fragment + if element.get('class', '') == 'theme-begin': + fragment = Fragment(id=element.get('fid'), themes=element.text) + + # Append parents + if element.getparent().get('id', None) != 'book-text': + parents = [element.getparent()] + while parents[-1].getparent().get('id', None) != 'book-text': + parents.append(parents[-1].getparent()) + + parents.reverse() + for parent in parents: + fragment.append('start', parent) + + open_fragments[fragment.id] = fragment + + # Close existing fragment + else: + try: + fragment = open_fragments[element.get('fid')] + except KeyError: + print '%s:closed not open fragment #%s' % (input_filename, element.get('fid')) + else: + closed_fragments[fragment.id] = fragment + del open_fragments[fragment.id] + + # Append element tail to lost_text (we don't want to lose any text) + if element.tail: + for fragment_id in open_fragments: + open_fragments[fragment_id].append('text', element.tail) + + + # Process all elements except begin and end + else: + # Omit annotation tags + if len(element.get('name', '')) or element.get('class', '') == 'annotation': + if event == 'end' and element.tail: + for fragment_id in open_fragments: + open_fragments[fragment_id].append('text', element.tail) + else: + for fragment_id in open_fragments: + open_fragments[fragment_id].append(event, copy.copy(element)) + + return closed_fragments, open_fragments + + +def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None): + if with_link: + if link_text is None: + link_text = prefix + anchor = etree.Element('a', href='#%s' % prefix) + anchor.set('class', 'anchor') + anchor.text = unicode(link_text) + if element.text: + anchor.tail = element.text + element.text = u'' + element.insert(0, anchor) + + if with_target: + anchor_target = etree.Element('a', name='%s' % prefix) + anchor_target.set('class', 'target') + anchor_target.text = u' ' + if element.text: + anchor_target.tail = element.text + element.text = u'' + element.insert(0, anchor_target) + + +def any_ancestor(element, test): + for ancestor in element.iterancestors(): + if test(ancestor): + return True + return False + + +def add_anchors(root): + counter = 1 + for element in root.iterdescendants(): + if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication') + or e.tag == 'blockquote'): + continue + + if element.tag == 'p' and 'verse' in element.get('class', ''): + if counter == 1 or counter % 5 == 0: + add_anchor(element, "f%d" % counter, link_text=counter) + counter += 1 + elif 'paragraph' in element.get('class', ''): + add_anchor(element, "f%d" % counter, link_text=counter) + counter += 1 + + +def add_table_of_contents(root): + sections = [] + counter = 1 + for element in root.iterdescendants(): + if element.tag in ('h2', 'h3'): + if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)): + continue + + if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2': + sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), [])) + else: + sections.append((counter, element.tag, ''.join(element.xpath('text()')), [])) + add_anchor(element, "s%d" % counter, with_link=False) + counter += 1 + + toc = etree.Element('div') + toc.set('id', 'toc') + toc_header = etree.SubElement(toc, 'h2') + toc_header.text = u'Spis treści' + toc_list = etree.SubElement(toc, 'ol') + + for n, section, text, subsections in sections: + section_element = etree.SubElement(toc_list, 'li') + add_anchor(section_element, "s%d" % n, with_target=False, link_text=text) + + if len(subsections): + subsection_list = etree.SubElement(section_element, 'ol') + for n, subsection, text, _ in subsections: + subsection_element = etree.SubElement(subsection_list, 'li') + add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text) + + root.insert(0, toc) + diff --git a/librarian/html.pyc b/librarian/html.pyc new file mode 100644 index 0000000..dfb837d Binary files /dev/null and b/librarian/html.pyc differ diff --git a/librarian/tests/__init__.py b/librarian/tests/__init__.py new file mode 100644 index 0000000..3f02541 --- /dev/null +++ b/librarian/tests/__init__.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- +import unittest +from os.path import dirname, join, realpath + +from lxml import etree +from librarian import dcparser, html + + +def test_file_path(dir_name, file_name): + return realpath(join(dirname(__file__), 'files', dir_name, file_name)) + + +class TestDCParser(unittest.TestCase): + KNOWN_RESULTS = ( + ('dcparser', 'andersen_brzydkie_kaczatko.xml', { + 'publisher': u'Fundacja Nowoczesna Polska', + 'about': u'http://wiki.wolnepodreczniki.pl/Lektury:Andersen/Brzydkie_kaczątko', + 'source_name': u'Andersen, Hans Christian (1805-1875), Baśnie, Gebethner i Wolff, wyd. 7, Kraków, 1925', + 'author': u'Andersen, Hans Christian', + 'url': u'http://wolnelektury.pl/katalog/lektura/brzydkie-kaczatko', + 'created_at': u'2007-08-14', + 'title': u'Brzydkie kaczątko', + 'kind': u'Epika', + 'source_url': u'http://www.polona.pl/dlibra/doccontent2?id=3563&dirids=4', + 'translator': u'Niewiadomska, Cecylia', + 'released_to_public_domain_at': u'1925-01-01', + 'epoch': u'Romantyzm', + 'genre': u'Baśń', + 'technical_editor': u'Gałecki, Dariusz', + 'license_description': u'Domena publiczna - tłumacz Cecylia Niewiadomska zm. 1925', + }), + ('dcparser', 'kochanowski_piesn7.xml', { + 'publisher': u'Fundacja Nowoczesna Polska', + 'about': u'http://wiki.wolnepodreczniki.pl/Lektury:Kochanowski/Pieśni/Pieśń_VII_(1)', + 'source_name': u'Kochanowski, Jan (1530-1584), Dzieła polskie, tom 1, oprac. Julian Krzyżanowski, wyd. 8, Państwowy Instytut Wydawniczy, Warszawa, 1976', + 'author': u'Kochanowski, Jan', + 'url': u'http://wolnelektury.pl/katalog/lektura/piesni-ksiegi-pierwsze-piesn-vii-trudna-rada-w-tej-mierze-pr', + 'created_at': u'2007-08-31', + 'title': u'Pieśń VII (Trudna rada w tej mierze: przyjdzie się rozjechać...)', + 'kind': u'Liryka', + 'source_url': u'http://www.polona.pl/Content/1499', + 'released_to_public_domain_at': u'1584-01-01', + 'epoch': u'Renesans', + 'genre': u'Pieśń', + 'technical_editor': u'Gałecki, Dariusz', + 'license_description': u'Domena publiczna - Jan Kochanowski zm. 1584 ', + }), + ('dcparser', 'mickiewicz_rybka.xml', { + 'publisher': u'Fundacja Nowoczesna Polska', + 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Mickiewicz/Ballady/Rybka', + 'source_name': u'Mickiewicz, Adam (1798-1855), Poezje, tom 1 (Wiersze młodzieńcze - Ballady i romanse - Wiersze do r. 1824), Krakowska Spółdzielnia Wydawnicza, wyd. 2 zwiększone, Kraków, 1922', + 'author': u'Mickiewicz, Adam', + 'url': u'http://wolnelektury.pl/katalog/lektura/ballady-i-romanse-rybka', + 'created_at': u'2007-09-06', + 'title': u'Rybka', + 'kind': u'Liryka', + 'source_url': u'http://www.polona.pl/Content/2222', + 'released_to_public_domain_at': u'1855-01-01', + 'epoch': u'Romantyzm', + 'genre': u'Ballada', + 'technical_editor': u'Sutkowska, Olga', + 'license_description': u'Domena publiczna - Adam Mickiewicz zm. 1855', + }), + ('dcparser', 'sofokles_antygona.xml', { + 'publisher': u'Fundacja Nowoczesna Polska', + 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Sofokles/Antygona', + 'source_name': u'Sofokles (496-406 a.C.), Antygona, Zakład Narodowy im. Ossolińskich, wyd. 7, Lwów, 1939', + 'author': u'Sofokles', + 'url': u'http://wolnelektury.pl/katalog/lektura/antygona', + 'created_at': u'2007-08-30', + 'title': u'Antygona', + 'kind': u'Dramat', + 'source_url': u'http://www.polona.pl/Content/3768', + 'translator': u'Morawski, Kazimierz', + 'released_to_public_domain_at': u'1925-01-01', + 'epoch': u'Starożytność', + 'genre': u'Tragedia', + 'technical_editor': u'Gałecki, Dariusz', + 'license_description': u'Domena publiczna - tłumacz Kazimierz Morawski zm. 1925', + }), + ('dcparser', 'biedrzycki_akslop.xml', { + 'publisher': u'Fundacja Nowoczesna Polska', + 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Biedrzycki/Akslop', + 'source_name': u'Miłosz Biedrzycki, * ("Gwiazdka"), Fundacja "brulion", Kraków-Warszawa, 1993', + 'author': u'Biedrzycki, Miłosz', + 'url': u'http://wolnelektury.pl/katalog/lektura/akslop', + 'created_at': u'2009-06-04', + 'title': u'Akslop', + 'kind': u'Liryka', + 'source_url': u'http://free.art.pl/mlb/gwiazdka.html#t1', + 'epoch': u'Współczesność', + 'genre': u'Wiersz', + 'technical_editor': u'Sutkowska, Olga', + 'license': u'http://creativecommons.org/licenses/by-sa/3.0/', + 'license_description': u'Creative Commons Uznanie Autorstwa - Na Tych Samych Warunkach 3.0.PL' + }), + ) + + def test_parse(self): + for dir_name, file_name, result in self.KNOWN_RESULTS: + self.assertEqual(dcparser.parse(test_file_path(dir_name, file_name)).to_dict(), result) + + +class TestParserErrors(unittest.TestCase): + def test_error(self): + try: + html.transform(test_file_path('erroneous', 'asnyk_miedzy_nami.xml'), + test_file_path('erroneous', 'asnyk_miedzy_nami.html')) + self.fail() + except etree.XMLSyntaxError, e: + self.assertEqual(e.position, (25, 13)) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/librarian/tests/files/dcparser/.DS_Store b/librarian/tests/files/dcparser/.DS_Store new file mode 100644 index 0000000..8817fe6 Binary files /dev/null and b/librarian/tests/files/dcparser/.DS_Store differ diff --git a/librarian/tests/files/dcparser/andersen_brzydkie_kaczatko.xml b/librarian/tests/files/dcparser/andersen_brzydkie_kaczatko.xml new file mode 100644 index 0000000..d653a9b --- /dev/null +++ b/librarian/tests/files/dcparser/andersen_brzydkie_kaczatko.xml @@ -0,0 +1,24 @@ + + + Andersen, Hans Christian + Brzydkie kaczątko + Niewiadomska, Cecylia + Gałecki, Dariusz + Fundacja Nowoczesna Polska + Romantyzm + Epika + Baśń + Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. + http://wolnelektury.pl/katalog/lektura/brzydkie-kaczatko + http://www.polona.pl/dlibra/doccontent2?id=3563&dirids=4 + Andersen, Hans Christian (1805-1875), Baśnie, Gebethner i Wolff, wyd. 7, Kraków, 1925 + Domena publiczna - tłumacz Cecylia Niewiadomska zm. 1925 + 1925 + xml + text + text + 2007-08-14 + SP1 + pol + + \ No newline at end of file diff --git a/librarian/tests/files/dcparser/biedrzycki_akslop.xml b/librarian/tests/files/dcparser/biedrzycki_akslop.xml new file mode 100644 index 0000000..da0cd9f --- /dev/null +++ b/librarian/tests/files/dcparser/biedrzycki_akslop.xml @@ -0,0 +1,25 @@ + + + Biedrzycki, Miłosz + Akslop + Sekuła, Aleksandra + Sutkowska, Olga + Fundacja Nowoczesna Polska + Współczesność + Liryka + Wiersz + Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). + http://wolnelektury.pl/katalog/lektura/akslop + http://free.art.pl/mlb/gwiazdka.html#t1 + Miłosz Biedrzycki, * ("Gwiazdka"), Fundacja "brulion", Kraków-Warszawa, 1993 + Creative Commons Uznanie Autorstwa - Na Tych Samych Warunkach 3.0.PL + http://creativecommons.org/licenses/by-sa/3.0/ + xml + text + text + 2009-06-04 + L + pol + + \ No newline at end of file diff --git a/librarian/tests/files/dcparser/kochanowski_piesn7.xml b/librarian/tests/files/dcparser/kochanowski_piesn7.xml new file mode 100644 index 0000000..96be1ae --- /dev/null +++ b/librarian/tests/files/dcparser/kochanowski_piesn7.xml @@ -0,0 +1,27 @@ + + + Kochanowski, Jan + Pieśń VII (Trudna rada w tej mierze: przyjdzie się rozjechać...) + http://www.wolnelektury.pl/lektura/piesni-ksiegi-pierwsze + Sekuła, Aleksandra + Krzyżanowski, Julian + Otwinowska, Barbara + Gałecki, Dariusz + Fundacja Nowoczesna Polska + Renesans + Liryka + Pieśń + Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. + http://wolnelektury.pl/katalog/lektura/piesni-ksiegi-pierwsze-piesn-vii-trudna-rada-w-tej-mierze-pr + http://www.polona.pl/Content/1499 + Kochanowski, Jan (1530-1584), Dzieła polskie, tom 1, oprac. Julian Krzyżanowski, wyd. 8, Państwowy Instytut Wydawniczy, Warszawa, 1976 + Domena publiczna - Jan Kochanowski zm. 1584 + 1584 + xml + text + text + 2007-08-31 + L + pol + + \ No newline at end of file diff --git a/librarian/tests/files/dcparser/mickiewicz_rybka.xml b/librarian/tests/files/dcparser/mickiewicz_rybka.xml new file mode 100644 index 0000000..0796a5b --- /dev/null +++ b/librarian/tests/files/dcparser/mickiewicz_rybka.xml @@ -0,0 +1,28 @@ + + + Mickiewicz, Adam + Rybka + http://www.wolnelektury.pl/lektura/ballady-i-romanse + Sekuła, Aleksandra + Kallenbach, Józef + Sutkowska, Olga + Fundacja Nowoczesna Polska + Romantyzm + Liryka + Ballada + Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. + http://wolnelektury.pl/katalog/lektura/ballady-i-romanse-rybka + http://www.polona.pl/Content/2222 + Mickiewicz, Adam (1798-1855), Poezje, tom 1 (Wiersze młodzieńcze - Ballady i romanse - Wiersze do r. 1824), Krakowska Spółdzielnia Wydawnicza, wyd. 2 zwiększone, Kraków, 1922 + Domena publiczna - Adam Mickiewicz zm. 1855 + 1855 + xml + text + text + 2007-09-06 + SP2 + G + L + pol + + \ No newline at end of file diff --git a/librarian/tests/files/dcparser/sofokles_antygona.xml b/librarian/tests/files/dcparser/sofokles_antygona.xml new file mode 100644 index 0000000..4acb2d4 --- /dev/null +++ b/librarian/tests/files/dcparser/sofokles_antygona.xml @@ -0,0 +1,25 @@ + + + Sofokles + Antygona + Sekuła, Aleksandra + Morawski, Kazimierz + Gałecki, Dariusz + Fundacja Nowoczesna Polska + Starożytność + Dramat + Tragedia + Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. + http://wolnelektury.pl/katalog/lektura/antygona + http://www.polona.pl/Content/3768 + Sofokles (496-406 a.C.), Antygona, Zakład Narodowy im. Ossolińskich, wyd. 7, Lwów, 1939 + Domena publiczna - tłumacz Kazimierz Morawski zm. 1925 + 1925 + xml + text + text + 2007-08-30 + G + pol + + \ No newline at end of file diff --git a/librarian/tests/files/erroneous/asnyk_miedzy_nami.html b/librarian/tests/files/erroneous/asnyk_miedzy_nami.html new file mode 100644 index 0000000..1d7e17f --- /dev/null +++ b/librarian/tests/files/erroneous/asnyk_miedzy_nami.html @@ -0,0 +1,46 @@ +
    +
    +

    Spis treści

    +
      +
    +

    + Adam Asnyk + Między nami nic nie było +

    +
    +

    1Między nami nic nie było!

    +

    + Żadnych zwierzeń, wyznań żadnych!

    +

    + Nic nas z sobą nie łączyło —

    +

    + Prócz wiosennych marzeń zdradnych;

    +
    +
    +

    5Prócz tych woni, barw i blasków,

    +

    + Unoszących się w przestrzeni;

    +

    + Prócz szumiących śpiewem lasków

    +

    + I tej świeżej łąk zieleni;

    +
    +
    +

    Prócz tych kaskad i potoków,

    +

    10 + Zraszających każdy parów,

    +

    + Prócz girlandy tęcz, obłoków,

    +

    + Prócz natury słodkich czarów;

    +
    +
    +

    Prócz tych wspólnych, jasnych zdrojów,

    +

    + Z których serce zachwyt piło;

    +

    15 + Prócz pierwiosnków i powojów,—

    +

    + Między nami nic nie było!

    +
    +
    diff --git a/librarian/tests/files/erroneous/asnyk_miedzy_nami.xml b/librarian/tests/files/erroneous/asnyk_miedzy_nami.xml new file mode 100644 index 0000000..aa5ef17 --- /dev/null +++ b/librarian/tests/files/erroneous/asnyk_miedzy_nami.xml @@ -0,0 +1,25 @@ + + + Adam Asnyk + Między nami nic nie było + + Między nami nic nie było!/ + Żadnych zwierzeń, wyznań żadnych!/ + Nic nas z sobą nie łączyło ---/ + Prócz wiosennych marzeń zdradnych; + + Prócz tych woni, barw i blasków,/ + Unoszących się w przestrzeni;/ + Prócz szumiących śpiewem lasków/ + I tej świeżej łąk zieleni; + + Prócz tych kaskad i potoków,/ + Zraszających każdy parów,/ + Prócz girlandy tęcz, obłoków,/ + Prócz natury słodkich czarów; + + Prócz tych wspólnych, jasnych zdrojów,/ + Z których serce zachwyt piło;/ + Prócz pierwiosnków i powojów,---/ + Między nami nic nie było! + diff --git a/librarian/text.py b/librarian/text.py new file mode 100644 index 0000000..db0d2b2 --- /dev/null +++ b/librarian/text.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +import os +import cStringIO +import re +import codecs + +from lxml import etree + +from librarian import dcparser + + +ENTITY_SUBSTITUTIONS = [ + (u'---', u'—'), + (u'--', u'–'), + (u'...', u'…'), + (u',,', u'„'), + (u'"', u'”'), +] + + +MAX_LINE_LENGTH = 80 + + +def strip(context, text): + """Remove unneeded whitespace from beginning and end""" + if isinstance(text, list): + text = ''.join(text) + return re.sub(r'\s+', ' ', text).strip() + + +def substitute_entities(context, text): + """XPath extension function converting all entites in passed text.""" + if isinstance(text, list): + text = ''.join(text) + for entity, substitutution in ENTITY_SUBSTITUTIONS: + text = text.replace(entity, substitutution) + return text + + +def wrap_words(context, text): + """XPath extension function automatically wrapping words in passed text""" + if isinstance(text, list): + text = ''.join(text) + words = re.split(r'\s', text) + + line_length = 0 + lines = [[]] + for word in words: + line_length += len(word) + 1 + if line_length > MAX_LINE_LENGTH: + # Max line length was exceeded. We create new line + lines.append([]) + line_length = len(word) + lines[-1].append(word) + return '\n'.join(' '.join(line) for line in lines) + + +# Register substitute_entities function with lxml +ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') +ns['strip'] = strip +ns['substitute_entities'] = substitute_entities +ns['wrap_words'] = wrap_words + + +def transform(input_filename, output_filename): + """Transforms file input_filename in XML to output_filename in TXT.""" + # Parse XSLT + style_filename = os.path.join(os.path.dirname(__file__), 'book2txt.xslt') + style = etree.parse(style_filename) + + doc_file = cStringIO.StringIO() + expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE); + + f = open(input_filename, 'r') + for line in f: + line = line.decode('utf-8') + line = expr.sub(u'
    \n', line) + doc_file.write(line.encode('utf-8')) + f.close() + + doc_file.seek(0) + + parser = etree.XMLParser(remove_blank_text=True) + doc = etree.parse(doc_file, parser) + + result = doc.xslt(style) + output_file = codecs.open(output_filename, 'wb', encoding='utf-8') + output_file.write(unicode(result) % dcparser.parse(input_filename).url) + diff --git a/librarian/text.pyc b/librarian/text.pyc new file mode 100644 index 0000000..6c6eb91 Binary files /dev/null and b/librarian/text.pyc differ diff --git a/scripts/book2html b/scripts/book2html new file mode 100755 index 0000000..a0229bb --- /dev/null +++ b/scripts/book2html @@ -0,0 +1,31 @@ +#!/usr/bin/env python +import os +import optparse + +from librarian import html + + +if __name__ == '__main__': + # Parse commandline arguments + usage = """Usage: %prog [options] SOURCE [SOURCE...] + Convert SOURCE files to HTML format.""" + + parser = optparse.OptionParser(usage=usage) + + parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, + help='print status messages to stdout') + + options, input_filenames = parser.parse_args() + + if len(input_filenames) < 1: + parser.print_help() + exit(1) + + # Do some real work + for input_filename in input_filenames: + if options.verbose: + print input_filename + + output_filename = os.path.splitext(input_filename)[0] + '.html' + html.transform(input_filename, output_filename) + diff --git a/scripts/book2txt b/scripts/book2txt new file mode 100755 index 0000000..1ca4623 --- /dev/null +++ b/scripts/book2txt @@ -0,0 +1,31 @@ +#!/usr/bin/env python +import os +import optparse + +from librarian import text + + +if __name__ == '__main__': + # Parse commandline arguments + usage = """Usage: %prog [options] SOURCE [SOURCE...] + Convert SOURCE files to TXT format.""" + + parser = optparse.OptionParser(usage=usage) + + parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, + help='print status messages to stdout') + + options, input_filenames = parser.parse_args() + + if len(input_filenames) < 1: + parser.print_help() + exit(1) + + # Do some real work + for input_filename in input_filenames: + if options.verbose: + print input_filename + + output_filename = os.path.splitext(input_filename)[0] + '.txt' + text.transform(input_filename, output_filename) + diff --git a/scripts/bookfragments b/scripts/bookfragments new file mode 100755 index 0000000..f29e11e --- /dev/null +++ b/scripts/bookfragments @@ -0,0 +1,50 @@ +#!/usr/bin/env python +import os +import optparse + +from librarian import html + + +if __name__ == '__main__': + # Parse commandline arguments + usage = """Usage: %prog [options] SOURCE [SOURCE...] + Extract theme fragments from SOURCE.""" + + parser = optparse.OptionParser(usage=usage) + + parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, + help='print status messages to stdout') + + options, input_filenames = parser.parse_args() + + if len(input_filenames) < 1: + parser.print_help() + exit(1) + + # Do some real work + for input_filename in input_filenames: + if options.verbose: + print input_filename + + output_filename = os.path.splitext(input_filename)[0] + '.fragments.html' + + closed_fragments, open_fragments = html.extract_fragments(input_filename) + + for fragment_id in open_fragments: + print '%s:warning:unclosed fragment #%s' % (input_filename, fragment_id) + + output_file = open(output_filename, 'w') + output_file.write(""" + + + bookfragments output + + + + """) + for fragment in closed_fragments.values(): + fragment_html = u'

    [#%s] %s

    %s
    ' % (fragment.id, fragment.themes, fragment) + output_file.write(fragment_html.encode('utf-8')) + output_file.write('') + output_file.close() + diff --git a/scripts/genslugs b/scripts/genslugs new file mode 100755 index 0000000..3391d8e --- /dev/null +++ b/scripts/genslugs @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import os +import optparse + +from lxml import etree +from librarian import html +from slughifi import slughifi + + +BOOK_URL = 'http://wolnelektury.pl/katalog/lektura/' + + +if __name__ == '__main__': + # Parse commandline arguments + usage = """Usage: %prog [options] SOURCE [SOURCE...] + Generate slugs for SOURCE.""" + + parser = optparse.OptionParser(usage=usage) + + parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, + help='print status messages to stdout') + parser.add_option('-f', '--force', action='store_true', dest='force', default=False, + help='overwrite current identifiers') + + options, input_filenames = parser.parse_args() + + if len(input_filenames) < 1: + parser.print_help() + exit(1) + + # Do some real work + for input_filename in input_filenames: + if options.verbose: + print input_filename + + doc = etree.parse(input_filename) + try: + title = doc.find('//{http://purl.org/dc/elements/1.1/}title').text + except AttributeError: + print '%s:error:Book title not found. Skipping.' % input_filename + continue + + parent = '' + try: + parent_url = doc.find('//{http://purl.org/dc/elements/1.1/}relation.isPartOf').text + parent = parent_url.rsplit('/', 1)[1] + ' ' + except AttributeError: + pass + except IndexError: + print '%s:error:Invalid parent URL "%s". Skipping.' % (input_filename, parent_url) + + book_url = doc.find('//{http://purl.org/dc/elements/1.1/}identifier.url') + if book_url is None: + book_description = doc.find('//{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description') + book_url = etree.SubElement(book_description, '{http://purl.org/dc/elements/1.1/}identifier.url') + if not options.force and book_url.text.startswith('http://'): + print '%s:Notice:Book already has identifier URL "%s". Skipping.' % (input_filename, book_url.text) + continue + + book_url.text = BOOK_URL + slughifi(parent + title)[:60] + + doc.write(input_filename, xml_declaration=True, pretty_print=True, encoding='utf-8') + diff --git a/scripts/master.css b/scripts/master.css new file mode 100644 index 0000000..98e142b --- /dev/null +++ b/scripts/master.css @@ -0,0 +1,207 @@ +body { + font-size: 16px; + font: Georgia, "Times New Roman", serif; + line-height: 1.5em; + margin: 3em; + max-width: 36em; +} + +a { + color: blue; + text-decoration: none; +} + +/* =================================================== */ +/* = Common elements: headings, paragraphs and lines = */ +/* =================================================== */ +h1 { + font-size: 3em; + margin: 1.5em 0; + text-align: center; + line-height: 1.5em; + font-weight: bold; +} + +h2 { + font-size: 2em; + margin: 1.5em 0 0; + font-weight: bold; + line-height: 1.5em; +} + +h3 { + font-size: 1.5em; + margin: 1.5em 0 0; + font-weight: normal; + line-height: 1.5em; +} + +h4 { + font-size: 1em; + margin: 1.5em 0 0; + line-height: 1.5em; +} + +p { + margin: 0; +} + +/* ======================== */ +/* = Footnotes and themes = */ +/* ======================== */ +.theme-begin { + border-left: 0.1em solid #DDDDDD; + color: #777; + padding: 0 0.5em; + width: 7.5em; + font-style: normal; + font-weight: normal; + font-size: 16px; + position: absolute; + left: 40em; + line-height: 1.5em; + text-align: left; +} + +.annotation { + font-style: normal; + font-weight: normal; + font-size: 12px; +} + +#footnotes .annotation { + display: block; + float: left; + width: 2.5em; + clear: both; +} + +#footnotes div { + margin: 1.5em 0 0 0; +} + +#footnotes p { + margin-left: 2.5em; +} + + +/* ============= */ +/* = Numbering = */ +/* ============= */ +.anchor { + float: left; + margin: -0.2em -0.5em -0.2em -3.5em; + color: #777; + font-size: 12px; + width: 2em; + text-align: center; + padding: 0.2em 0.5em; +} + +.anchor:hover, .anchor:active { + color: #FFF; + background-color: #CCC; +} + + +/* =================== */ +/* = Custom elements = */ +/* =================== */ +span.author { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-bottom: 0.25em; +} + +span.collection { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-bottom: -0.25em; +} + +span.subtitle { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-top: -0.25em; +} + +div.didaskalia { + font-style: italic; + margin: 0.5em 0 0; +} + +div.kwestia { + margin: 0.5em 0 0; +} + +div.stanza { + margin: 1.5em 0 0; +} + +div.kwestia div.stanza { + margin: 0; +} + +p.paragraph { + text-align: justify; + margin: 1.5em 0 0; +} + +p.motto { + text-align: justify; + font-style: italic; + margin: 1.5em 0 0; +} + +p.motto_podpis { + font-size: 0.875em; +} + +div.fragment { + border-bottom: 0.1em solid #999; + padding-bottom: 1.5em; +} + +div.note p, div.dedication p, div.note p.paragraph, div.dedication p.paragraph { + text-align: right; + font-style: italic; +} + +hr.spacer { + height: 3em; + visibility: hidden; +} + +hr.spacer-line { + margin: 1.5em 0; + border: none; + border-bottom: 0.1em solid #000; +} + +p.spacer-asterisk { + padding: 0; + margin: 1.5em 0; + text-align: center; +} + +div.person-list ol { + list-style: none; + padding: 0 0 0 1.5em; +} + +p.place-and-time { + font-style: italic; +} + +em.math, em.foreign-word, em.book-title, em.didaskalia, em.author-emphasis { + font-style: italic; +} + +em.person { + font-style: normal; + font-variant: small-caps; +} + diff --git a/scripts/master.plain.css b/scripts/master.plain.css new file mode 100644 index 0000000..3210e88 --- /dev/null +++ b/scripts/master.plain.css @@ -0,0 +1,160 @@ +body { + font-size: 16px; + font: Georgia, "Times New Roman", serif; + line-height: 1.5em; + margin: 3em; + max-width: 36em; +} + +a { + color: blue; + text-decoration: none; +} + +/* =================================================== */ +/* = Common elements: headings, paragraphs and lines = */ +/* =================================================== */ +h1 { + font-size: 3em; + margin: 1.5em 0; + text-align: center; + line-height: 1.5em; + font-weight: bold; +} + +h2 { + font-size: 2em; + margin: 1.5em 0 0; + font-weight: bold; + line-height: 1.5em; +} + +h3 { + font-size: 1.5em; + margin: 1.5em 0 0; + font-weight: normal; + line-height: 1.5em; +} + +h4 { + font-size: 1em; + margin: 1.5em 0 0; + line-height: 1.5em; +} + +p { + margin: 0; +} + +/* ======================== */ +/* = Footnotes and themes = */ +/* ======================== */ +.theme-begin { + border-left: 0.1em solid #DDDDDD; + color: #666; + float: right; + margin: 0 -9.5em 0 0; + padding: 0 0.5em; + width: 7.5em; + font-style: normal; + font-weight: normal; + font-size: 16px; + display: none; +} + +.annotation { + font-style: normal; + font-weight: normal; + font-size: 16px; + display: none; +} + +#footnotes { + display: none; +} + +#footnotes .annotation { + display: block; + float: left; + width: 2.5em; + clear: both; +} + +#footnotes div { + margin: 1.5em 0 0 0; +} + +#footnotes p { + margin-left: 2.5em; +} + +/* =================== */ +/* = Custom elements = */ +/* =================== */ +span.author { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-bottom: 0.25em; +} + +span.collection { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-bottom: -0.25em; +} + +span.subtitle { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-top: -0.25em; +} + +div.didaskalia { + font-style: italic; + margin: 0.5em 0 0; +} + +div.kwestia { + margin: 0.5em 0 0; +} + +div.stanza { + margin: 1.5em 0 0; +} + +div.kwestia div.stanza { + margin: 0; +} + +p.paragraph { + text-align: justify; + margin: 1.5em 0 0; +} + +p.motto { + text-align: justify; + font-style: italic; + margin: 1.5em 0 0; +} + +p.motto_podpis { + font-size: 0.875em; +} + +div.fragment { + border-bottom: 0.1em solid #999; + padding-bottom: 1.5em; +} + +div.note p, div.note p.paragraph { + text-align: right; + font-style: italic; +} + +hr.spacer { + height: 3em; + visibility: hidden; +} diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..0988321 --- /dev/null +++ b/setup.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +from distutils.core import setup + + +setup( + name='librarian', + version='1.1', + description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats', + author='Marek Stępniowski', + author_email='marek@stepniowski.com', + url='http://redmine.nowoczesnapolska.org.pl/', + packages=['librarian', 'librarian.tests'], + package_dir={'librarian': 'librarian'}, + package_data={ + 'librarian': ['*.xslt'], + 'librarian.tests': ['files/dcparser/*.xml', 'files/erroneous/*.xml'], + }, + scripts=['scripts/book2html', 'scripts/book2txt', 'scripts/bookfragments', 'scripts/genslugs'], +)