From af98f6d6812632c79bfb73139da7a6686abd0dbb Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20St=C4=99pniowski?= Date: Wed, 12 Aug 2009 13:44:04 +0200 Subject: [PATCH] =?utf8?q?Usuni=C4=99cie=20biblioteki=20librarian,=20kt?= =?utf8?q?=C3=B3ra=20teraz=20rozwijana=20jest=20w=20innym=20projekcie.?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- README | 1 + lib/librarian/__init__.py | 0 lib/librarian/bin/book2html.py | 31 - lib/librarian/bin/book2txt.py | 31 - lib/librarian/bin/bookfragments.py | 50 -- lib/librarian/bin/genslugs.py | 64 -- lib/librarian/bin/master.css | 207 ------ lib/librarian/bin/master.plain.css | 160 ----- lib/librarian/book2html.xslt | 615 ------------------ lib/librarian/book2txt.xslt | 308 --------- lib/librarian/dcparser.py | 197 ------ lib/librarian/html.py | 242 ------- lib/librarian/tests/__init__.py | 115 ---- .../dcparser/andersen_brzydkie_kaczatko.xml | 24 - .../files/dcparser/biedrzycki_akslop.xml | 25 - .../files/dcparser/kochanowski_piesn7.xml | 27 - .../tests/files/dcparser/mickiewicz_rybka.xml | 28 - .../files/dcparser/sofokles_antygona.xml | 25 - .../files/erroneous/asnyk_miedzy_nami.html | 46 -- .../files/erroneous/asnyk_miedzy_nami.xml | 25 - lib/librarian/text.py | 81 --- 21 files changed, 1 insertion(+), 2301 deletions(-) delete mode 100644 lib/librarian/__init__.py delete mode 100755 lib/librarian/bin/book2html.py delete mode 100755 lib/librarian/bin/book2txt.py delete mode 100755 lib/librarian/bin/bookfragments.py delete mode 100755 lib/librarian/bin/genslugs.py delete mode 100644 lib/librarian/bin/master.css delete mode 100644 lib/librarian/bin/master.plain.css delete mode 100644 lib/librarian/book2html.xslt delete mode 100644 lib/librarian/book2txt.xslt delete mode 100644 lib/librarian/dcparser.py delete mode 100644 lib/librarian/html.py delete mode 100644 lib/librarian/tests/__init__.py delete mode 100644 lib/librarian/tests/files/dcparser/andersen_brzydkie_kaczatko.xml delete mode 100644 lib/librarian/tests/files/dcparser/biedrzycki_akslop.xml delete mode 100644 lib/librarian/tests/files/dcparser/kochanowski_piesn7.xml delete mode 100644 lib/librarian/tests/files/dcparser/mickiewicz_rybka.xml delete mode 100644 lib/librarian/tests/files/dcparser/sofokles_antygona.xml delete mode 100644 lib/librarian/tests/files/erroneous/asnyk_miedzy_nami.html delete mode 100644 lib/librarian/tests/files/erroneous/asnyk_miedzy_nami.xml delete mode 100644 lib/librarian/text.py diff --git a/README b/README index 1985a8b86..fa3064e1c 100644 --- a/README +++ b/README @@ -3,6 +3,7 @@ Dependencies * [Django 1.0](http://djangoproject.com/) (application framework) * [lxml 2.0.0](http://codespeak.net/lxml/) (for importing books) + * [librarian 1.1](http://redmine.nowoczesnapolska.org.pl/projects/show/librarian) (for importing books) * Python libraries from lib directory * Django applications from apps directory diff --git a/lib/librarian/__init__.py b/lib/librarian/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/lib/librarian/bin/book2html.py b/lib/librarian/bin/book2html.py deleted file mode 100755 index a0229bbe7..000000000 --- a/lib/librarian/bin/book2html.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python -import os -import optparse - -from librarian import html - - -if __name__ == '__main__': - # Parse commandline arguments - usage = """Usage: %prog [options] SOURCE [SOURCE...] - Convert SOURCE files to HTML format.""" - - parser = optparse.OptionParser(usage=usage) - - parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, - help='print status messages to stdout') - - options, input_filenames = parser.parse_args() - - if len(input_filenames) < 1: - parser.print_help() - exit(1) - - # Do some real work - for input_filename in input_filenames: - if options.verbose: - print input_filename - - output_filename = os.path.splitext(input_filename)[0] + '.html' - html.transform(input_filename, output_filename) - diff --git a/lib/librarian/bin/book2txt.py b/lib/librarian/bin/book2txt.py deleted file mode 100755 index 1ca4623fd..000000000 --- a/lib/librarian/bin/book2txt.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python -import os -import optparse - -from librarian import text - - -if __name__ == '__main__': - # Parse commandline arguments - usage = """Usage: %prog [options] SOURCE [SOURCE...] - Convert SOURCE files to TXT format.""" - - parser = optparse.OptionParser(usage=usage) - - parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, - help='print status messages to stdout') - - options, input_filenames = parser.parse_args() - - if len(input_filenames) < 1: - parser.print_help() - exit(1) - - # Do some real work - for input_filename in input_filenames: - if options.verbose: - print input_filename - - output_filename = os.path.splitext(input_filename)[0] + '.txt' - text.transform(input_filename, output_filename) - diff --git a/lib/librarian/bin/bookfragments.py b/lib/librarian/bin/bookfragments.py deleted file mode 100755 index f29e11e02..000000000 --- a/lib/librarian/bin/bookfragments.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python -import os -import optparse - -from librarian import html - - -if __name__ == '__main__': - # Parse commandline arguments - usage = """Usage: %prog [options] SOURCE [SOURCE...] - Extract theme fragments from SOURCE.""" - - parser = optparse.OptionParser(usage=usage) - - parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, - help='print status messages to stdout') - - options, input_filenames = parser.parse_args() - - if len(input_filenames) < 1: - parser.print_help() - exit(1) - - # Do some real work - for input_filename in input_filenames: - if options.verbose: - print input_filename - - output_filename = os.path.splitext(input_filename)[0] + '.fragments.html' - - closed_fragments, open_fragments = html.extract_fragments(input_filename) - - for fragment_id in open_fragments: - print '%s:warning:unclosed fragment #%s' % (input_filename, fragment_id) - - output_file = open(output_filename, 'w') - output_file.write(""" - - - bookfragments output - - - - """) - for fragment in closed_fragments.values(): - fragment_html = u'

[#%s] %s

%s
' % (fragment.id, fragment.themes, fragment) - output_file.write(fragment_html.encode('utf-8')) - output_file.write('') - output_file.close() - diff --git a/lib/librarian/bin/genslugs.py b/lib/librarian/bin/genslugs.py deleted file mode 100755 index 3391d8e52..000000000 --- a/lib/librarian/bin/genslugs.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import os -import optparse - -from lxml import etree -from librarian import html -from slughifi import slughifi - - -BOOK_URL = 'http://wolnelektury.pl/katalog/lektura/' - - -if __name__ == '__main__': - # Parse commandline arguments - usage = """Usage: %prog [options] SOURCE [SOURCE...] - Generate slugs for SOURCE.""" - - parser = optparse.OptionParser(usage=usage) - - parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, - help='print status messages to stdout') - parser.add_option('-f', '--force', action='store_true', dest='force', default=False, - help='overwrite current identifiers') - - options, input_filenames = parser.parse_args() - - if len(input_filenames) < 1: - parser.print_help() - exit(1) - - # Do some real work - for input_filename in input_filenames: - if options.verbose: - print input_filename - - doc = etree.parse(input_filename) - try: - title = doc.find('//{http://purl.org/dc/elements/1.1/}title').text - except AttributeError: - print '%s:error:Book title not found. Skipping.' % input_filename - continue - - parent = '' - try: - parent_url = doc.find('//{http://purl.org/dc/elements/1.1/}relation.isPartOf').text - parent = parent_url.rsplit('/', 1)[1] + ' ' - except AttributeError: - pass - except IndexError: - print '%s:error:Invalid parent URL "%s". Skipping.' % (input_filename, parent_url) - - book_url = doc.find('//{http://purl.org/dc/elements/1.1/}identifier.url') - if book_url is None: - book_description = doc.find('//{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description') - book_url = etree.SubElement(book_description, '{http://purl.org/dc/elements/1.1/}identifier.url') - if not options.force and book_url.text.startswith('http://'): - print '%s:Notice:Book already has identifier URL "%s". Skipping.' % (input_filename, book_url.text) - continue - - book_url.text = BOOK_URL + slughifi(parent + title)[:60] - - doc.write(input_filename, xml_declaration=True, pretty_print=True, encoding='utf-8') - diff --git a/lib/librarian/bin/master.css b/lib/librarian/bin/master.css deleted file mode 100644 index 98e142b9f..000000000 --- a/lib/librarian/bin/master.css +++ /dev/null @@ -1,207 +0,0 @@ -body { - font-size: 16px; - font: Georgia, "Times New Roman", serif; - line-height: 1.5em; - margin: 3em; - max-width: 36em; -} - -a { - color: blue; - text-decoration: none; -} - -/* =================================================== */ -/* = Common elements: headings, paragraphs and lines = */ -/* =================================================== */ -h1 { - font-size: 3em; - margin: 1.5em 0; - text-align: center; - line-height: 1.5em; - font-weight: bold; -} - -h2 { - font-size: 2em; - margin: 1.5em 0 0; - font-weight: bold; - line-height: 1.5em; -} - -h3 { - font-size: 1.5em; - margin: 1.5em 0 0; - font-weight: normal; - line-height: 1.5em; -} - -h4 { - font-size: 1em; - margin: 1.5em 0 0; - line-height: 1.5em; -} - -p { - margin: 0; -} - -/* ======================== */ -/* = Footnotes and themes = */ -/* ======================== */ -.theme-begin { - border-left: 0.1em solid #DDDDDD; - color: #777; - padding: 0 0.5em; - width: 7.5em; - font-style: normal; - font-weight: normal; - font-size: 16px; - position: absolute; - left: 40em; - line-height: 1.5em; - text-align: left; -} - -.annotation { - font-style: normal; - font-weight: normal; - font-size: 12px; -} - -#footnotes .annotation { - display: block; - float: left; - width: 2.5em; - clear: both; -} - -#footnotes div { - margin: 1.5em 0 0 0; -} - -#footnotes p { - margin-left: 2.5em; -} - - -/* ============= */ -/* = Numbering = */ -/* ============= */ -.anchor { - float: left; - margin: -0.2em -0.5em -0.2em -3.5em; - color: #777; - font-size: 12px; - width: 2em; - text-align: center; - padding: 0.2em 0.5em; -} - -.anchor:hover, .anchor:active { - color: #FFF; - background-color: #CCC; -} - - -/* =================== */ -/* = Custom elements = */ -/* =================== */ -span.author { - font-size: 0.75em; - display: block; - line-height: 1.5em; - margin-bottom: 0.25em; -} - -span.collection { - font-size: 0.75em; - display: block; - line-height: 1.5em; - margin-bottom: -0.25em; -} - -span.subtitle { - font-size: 0.75em; - display: block; - line-height: 1.5em; - margin-top: -0.25em; -} - -div.didaskalia { - font-style: italic; - margin: 0.5em 0 0; -} - -div.kwestia { - margin: 0.5em 0 0; -} - -div.stanza { - margin: 1.5em 0 0; -} - -div.kwestia div.stanza { - margin: 0; -} - -p.paragraph { - text-align: justify; - margin: 1.5em 0 0; -} - -p.motto { - text-align: justify; - font-style: italic; - margin: 1.5em 0 0; -} - -p.motto_podpis { - font-size: 0.875em; -} - -div.fragment { - border-bottom: 0.1em solid #999; - padding-bottom: 1.5em; -} - -div.note p, div.dedication p, div.note p.paragraph, div.dedication p.paragraph { - text-align: right; - font-style: italic; -} - -hr.spacer { - height: 3em; - visibility: hidden; -} - -hr.spacer-line { - margin: 1.5em 0; - border: none; - border-bottom: 0.1em solid #000; -} - -p.spacer-asterisk { - padding: 0; - margin: 1.5em 0; - text-align: center; -} - -div.person-list ol { - list-style: none; - padding: 0 0 0 1.5em; -} - -p.place-and-time { - font-style: italic; -} - -em.math, em.foreign-word, em.book-title, em.didaskalia, em.author-emphasis { - font-style: italic; -} - -em.person { - font-style: normal; - font-variant: small-caps; -} - diff --git a/lib/librarian/bin/master.plain.css b/lib/librarian/bin/master.plain.css deleted file mode 100644 index 3210e8819..000000000 --- a/lib/librarian/bin/master.plain.css +++ /dev/null @@ -1,160 +0,0 @@ -body { - font-size: 16px; - font: Georgia, "Times New Roman", serif; - line-height: 1.5em; - margin: 3em; - max-width: 36em; -} - -a { - color: blue; - text-decoration: none; -} - -/* =================================================== */ -/* = Common elements: headings, paragraphs and lines = */ -/* =================================================== */ -h1 { - font-size: 3em; - margin: 1.5em 0; - text-align: center; - line-height: 1.5em; - font-weight: bold; -} - -h2 { - font-size: 2em; - margin: 1.5em 0 0; - font-weight: bold; - line-height: 1.5em; -} - -h3 { - font-size: 1.5em; - margin: 1.5em 0 0; - font-weight: normal; - line-height: 1.5em; -} - -h4 { - font-size: 1em; - margin: 1.5em 0 0; - line-height: 1.5em; -} - -p { - margin: 0; -} - -/* ======================== */ -/* = Footnotes and themes = */ -/* ======================== */ -.theme-begin { - border-left: 0.1em solid #DDDDDD; - color: #666; - float: right; - margin: 0 -9.5em 0 0; - padding: 0 0.5em; - width: 7.5em; - font-style: normal; - font-weight: normal; - font-size: 16px; - display: none; -} - -.annotation { - font-style: normal; - font-weight: normal; - font-size: 16px; - display: none; -} - -#footnotes { - display: none; -} - -#footnotes .annotation { - display: block; - float: left; - width: 2.5em; - clear: both; -} - -#footnotes div { - margin: 1.5em 0 0 0; -} - -#footnotes p { - margin-left: 2.5em; -} - -/* =================== */ -/* = Custom elements = */ -/* =================== */ -span.author { - font-size: 0.75em; - display: block; - line-height: 1.5em; - margin-bottom: 0.25em; -} - -span.collection { - font-size: 0.75em; - display: block; - line-height: 1.5em; - margin-bottom: -0.25em; -} - -span.subtitle { - font-size: 0.75em; - display: block; - line-height: 1.5em; - margin-top: -0.25em; -} - -div.didaskalia { - font-style: italic; - margin: 0.5em 0 0; -} - -div.kwestia { - margin: 0.5em 0 0; -} - -div.stanza { - margin: 1.5em 0 0; -} - -div.kwestia div.stanza { - margin: 0; -} - -p.paragraph { - text-align: justify; - margin: 1.5em 0 0; -} - -p.motto { - text-align: justify; - font-style: italic; - margin: 1.5em 0 0; -} - -p.motto_podpis { - font-size: 0.875em; -} - -div.fragment { - border-bottom: 0.1em solid #999; - padding-bottom: 1.5em; -} - -div.note p, div.note p.paragraph { - text-align: right; - font-style: italic; -} - -hr.spacer { - height: 3em; - visibility: hidden; -} diff --git a/lib/librarian/book2html.xslt b/lib/librarian/book2html.xslt deleted file mode 100644 index 71f118209..000000000 --- a/lib/librarian/book2html.xslt +++ /dev/null @@ -1,615 +0,0 @@ - - - - - - - -
- - -
-

Przypisy

- -
- - [] - - -

-
- - - -
-
-
-
-
-
- -
- - - - - - - - -

- -

-
- -
- - - - - - - -
-
- - -
-

-
    - -
-
-
- - -
-
- - -
- -
-
- - -
-
- - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - -

-
- - -

-
- - -

-
- - - -

-
- - -
-
- - -
  • -
    - - -

    -
    - - -
    - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -

    - - - padding-left: 1em - - - - - padding-left: em - - - padding-left: 1em - - - - - padding-left: 12em - - - -

    -
    - - -

    -
    - - - - - - - - - - [] - - - - - - - - - - - - - - - - - - „” - - - - - - - - - - - - - - - - - -
    -
    - - -

    *

    -
    - - -
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    - diff --git a/lib/librarian/book2txt.xslt b/lib/librarian/book2txt.xslt deleted file mode 100644 index d3658a3f7..000000000 --- a/lib/librarian/book2txt.xslt +++ /dev/null @@ -1,308 +0,0 @@ - - - - - - - - - -Kodowanie znaków w dokumencie: UTF-8. ------ -Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl/). Reprodukcja cyfrowa wykonana przez -Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. Ten utwór nie jest chroniony prawem autorskim i znajduje -się w domenie publicznej, co oznacza, że możesz go swobodnie wykorzystywać, publikować i rozpowszechniać. - -Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dostępna jest na stronie %s. ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -/ / - - - - - * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -„” - - - -** - - - - - - - - - - - - - - - - - - - - - - -* - - - - - - - ------------------------------------------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/lib/librarian/dcparser.py b/lib/librarian/dcparser.py deleted file mode 100644 index 557509c95..000000000 --- a/lib/librarian/dcparser.py +++ /dev/null @@ -1,197 +0,0 @@ -# -*- coding: utf-8 -*- -from xml.parsers.expat import ExpatError -from datetime import date -import time - -# Import ElementTree from anywhere -try: - import xml.etree.ElementTree as etree # Python >= 2.5 -except ImportError: - try: - import elementtree.ElementTree as etree # effbot's pure Python module - except ImportError: - import lxml.etree as etree # ElementTree API using libxml2 - - -# ============== -# = Converters = -# ============== -class Person(object): - """Single person with last name and a list of first names.""" - def __init__(self, last_name, *first_names): - self.last_name = last_name - self.first_names = first_names - - - def __eq__(self, right): - return self.last_name == right.last_name and self.first_names == right.first_names - - - def __unicode__(self): - if len(self.first_names) > 0: - return '%s, %s' % (self.last_name, ' '.join(self.first_names)) - else: - return self.last_name - - - def __repr__(self): - return 'Person(last_name=%r, first_names=*%r)' % (self.last_name, self.first_names) - - -def str_to_unicode(value, previous): - return unicode(value) - - -def str_to_unicode_list(value, previous): - if previous is None: - previous = [] - previous.append(str_to_unicode(value, None)) - return previous - - -def str_to_person(value, previous): - comma_count = value.count(',') - - if comma_count == 0: - last_name, first_names = value, [] - elif comma_count == 1: - last_name, first_names = value.split(',') - first_names = [name for name in first_names.split(' ') if len(name)] - else: - raise ValueError("value contains more than one comma: %r" % value) - - return Person(last_name.strip(), *first_names) - - -def str_to_date(value, previous): - try: - t = time.strptime(value, '%Y-%m-%d') - except ValueError: - t = time.strptime(value, '%Y') - return date(t[0], t[1], t[2]) - - -# ========== -# = Parser = -# ========== -class ParseError(Exception): - def __init__(self, message): - super(ParseError, self).__init__(message) - - -class XMLNamespace(object): - '''Represents XML namespace.''' - - def __init__(self, uri): - self.uri = uri - - def __call__(self, tag): - return '{%s}%s' % (self.uri, tag) - - def __contains__(self, tag): - return tag.startswith(str(self)) - - def __repr__(self): - return 'XMLNamespace(%r)' % self.uri - - def __str__(self): - return '%s' % self.uri - - -class BookInfo(object): - RDF = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#') - DC = XMLNamespace('http://purl.org/dc/elements/1.1/') - - mapping = { - DC('creator') : ('author', str_to_person), - DC('title') : ('title', str_to_unicode), - DC('subject.period') : ('epoch', str_to_unicode), - DC('subject.type') : ('kind', str_to_unicode), - DC('subject.genre') : ('genre', str_to_unicode), - DC('date') : ('created_at', str_to_date), - DC('date.pd') : ('released_to_public_domain_at', str_to_date), - DC('contributor.translator') : ('translator', str_to_person), - DC('contributor.technical_editor') : ('technical_editor', str_to_person), - DC('publisher') : ('publisher', str_to_unicode), - DC('source') : ('source_name', str_to_unicode), - DC('source.URL') : ('source_url', str_to_unicode), - DC('identifier.url') : ('url', str_to_unicode), - DC('relation.hasPart') : ('parts', str_to_unicode_list), - DC('rights.license') : ('license', str_to_unicode), - DC('rights') : ('license_description', str_to_unicode), - } - - @classmethod - def from_string(cls, xml): - from StringIO import StringIO - return cls.from_file(StringIO(xml)) - - @classmethod - def from_file(cls, xml_file): - book_info = cls() - - try: - tree = etree.parse(xml_file) - except ExpatError, e: - raise ParseError(e) - - description = tree.find('//' + book_info.RDF('Description')) - book_info.wiki_url = description.get(cls.RDF('about'), None) - - if description is None: - raise ParseError('no Description tag found in document') - - for element in description.findall('*'): - book_info.parse_element(element) - - return book_info - - def parse_element(self, element): - try: - attribute, converter = self.mapping[element.tag] - setattr(self, attribute, converter(element.text, getattr(self, attribute, None))) - except KeyError: - pass - - def to_xml(self): - """XML representation of this object.""" - etree._namespace_map[str(self.RDF)] = 'rdf' - etree._namespace_map[str(self.DC)] = 'dc' - - root = etree.Element(self.RDF('RDF')) - description = etree.SubElement(root, self.RDF('Description')) - - if self.wiki_url: - description.set(self.RDF('about'), self.wiki_url) - - for tag, (attribute, converter) in self.mapping.iteritems(): - if hasattr(self, attribute): - e = etree.Element(tag) - e.text = unicode(getattr(self, attribute)) - description.append(e) - - return unicode(etree.tostring(root, 'utf-8'), 'utf-8') - - def to_dict(self): - etree._namespace_map[str(self.RDF)] = 'rdf' - etree._namespace_map[str(self.DC)] = 'dc' - - result = {'about': self.wiki_url} - for tag, (attribute, converter) in self.mapping.iteritems(): - if hasattr(self, attribute): - result[attribute] = unicode(getattr(self, attribute)) - - return result - - -def parse(file_name): - return BookInfo.from_file(file_name) - - -if __name__ == '__main__': - import sys - - info = parse(sys.argv[1]) - for attribute, _ in BookInfo.mapping.values(): - print '%s: %r' % (attribute, getattr(info, attribute, None)) - diff --git a/lib/librarian/html.py b/lib/librarian/html.py deleted file mode 100644 index b279e5dd5..000000000 --- a/lib/librarian/html.py +++ /dev/null @@ -1,242 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import cStringIO -import re -import copy -import pkgutil - -from lxml import etree - - -ENTITY_SUBSTITUTIONS = [ - (u'---', u'—'), - (u'--', u'–'), - (u'...', u'…'), - (u',,', u'„'), - (u'"', u'”'), -] - - -def substitute_entities(context, text): - """XPath extension function converting all entites in passed text.""" - if isinstance(text, list): - text = ''.join(text) - for entity, substitutution in ENTITY_SUBSTITUTIONS: - text = text.replace(entity, substitutution) - return text - - -# Register substitute_entities function with lxml -ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') -ns['substitute_entities'] = substitute_entities - - -def transform(input_filename, output_filename): - """Transforms file input_filename in XML to output_filename in XHTML.""" - # Parse XSLT - style_filename = os.path.join(os.path.dirname(__file__), 'book2html.xslt') - style = etree.parse(style_filename) - - doc_file = cStringIO.StringIO() - expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE); - - f = open(input_filename, 'r') - for line in f: - line = line.decode('utf-8') - line = expr.sub(u'
    \n', line) - doc_file.write(line.encode('utf-8')) - f.close() - - doc_file.seek(0); - - parser = etree.XMLParser(remove_blank_text=True) - doc = etree.parse(doc_file, parser) - - result = doc.xslt(style) - if result.find('//p') is not None: - add_anchors(result.getroot()) - add_table_of_contents(result.getroot()) - result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8') - return True - else: - return False - - -class Fragment(object): - def __init__(self, id, themes): - super(Fragment, self).__init__() - self.id = id - self.themes = themes - self.events = [] - - def append(self, event, element): - self.events.append((event, element)) - - def closed_events(self): - stack = [] - for event, element in self.events: - if event == 'start': - stack.append(('end', element)) - elif event == 'end': - try: - stack.pop() - except IndexError: - print 'CLOSED NON-OPEN TAG:', element - - stack.reverse() - return self.events + stack - - def to_string(self): - result = [] - for event, element in self.closed_events(): - if event == 'start': - result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items()))) - if element.text: - result.append(element.text) - elif event == 'end': - result.append(u'' % element.tag) - if element.tail: - result.append(element.tail) - else: - result.append(element) - - return ''.join(result) - - def __unicode__(self): - return self.to_string() - - -def extract_fragments(input_filename): - """Extracts theme fragments from input_filename.""" - open_fragments = {} - closed_fragments = {} - - for event, element in etree.iterparse(input_filename, events=('start', 'end')): - # Process begin and end elements - if element.get('class', '') in ('theme-begin', 'theme-end'): - if not event == 'end': continue # Process elements only once, on end event - - # Open new fragment - if element.get('class', '') == 'theme-begin': - fragment = Fragment(id=element.get('fid'), themes=element.text) - - # Append parents - if element.getparent().get('id', None) != 'book-text': - parents = [element.getparent()] - while parents[-1].getparent().get('id', None) != 'book-text': - parents.append(parents[-1].getparent()) - - parents.reverse() - for parent in parents: - fragment.append('start', parent) - - open_fragments[fragment.id] = fragment - - # Close existing fragment - else: - try: - fragment = open_fragments[element.get('fid')] - except KeyError: - print '%s:closed not open fragment #%s' % (input_filename, element.get('fid')) - else: - closed_fragments[fragment.id] = fragment - del open_fragments[fragment.id] - - # Append element tail to lost_text (we don't want to lose any text) - if element.tail: - for fragment_id in open_fragments: - open_fragments[fragment_id].append('text', element.tail) - - - # Process all elements except begin and end - else: - # Omit annotation tags - if len(element.get('name', '')) or element.get('class', '') == 'annotation': - if event == 'end' and element.tail: - for fragment_id in open_fragments: - open_fragments[fragment_id].append('text', element.tail) - else: - for fragment_id in open_fragments: - open_fragments[fragment_id].append(event, copy.copy(element)) - - return closed_fragments, open_fragments - - -def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None): - if with_link: - if link_text is None: - link_text = prefix - anchor = etree.Element('a', href='#%s' % prefix) - anchor.set('class', 'anchor') - anchor.text = unicode(link_text) - if element.text: - anchor.tail = element.text - element.text = u'' - element.insert(0, anchor) - - if with_target: - anchor_target = etree.Element('a', name='%s' % prefix) - anchor_target.set('class', 'target') - anchor_target.text = u' ' - if element.text: - anchor_target.tail = element.text - element.text = u'' - element.insert(0, anchor_target) - - -def any_ancestor(element, test): - for ancestor in element.iterancestors(): - if test(ancestor): - return True - return False - - -def add_anchors(root): - counter = 1 - for element in root.iterdescendants(): - if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication') - or e.tag == 'blockquote'): - continue - - if element.tag == 'p' and 'verse' in element.get('class', ''): - if counter == 1 or counter % 5 == 0: - add_anchor(element, "f%d" % counter, link_text=counter) - counter += 1 - elif 'paragraph' in element.get('class', ''): - add_anchor(element, "f%d" % counter, link_text=counter) - counter += 1 - - -def add_table_of_contents(root): - sections = [] - counter = 1 - for element in root.iterdescendants(): - if element.tag in ('h2', 'h3'): - if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)): - continue - - if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2': - sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), [])) - else: - sections.append((counter, element.tag, ''.join(element.xpath('text()')), [])) - add_anchor(element, "s%d" % counter, with_link=False) - counter += 1 - - toc = etree.Element('div') - toc.set('id', 'toc') - toc_header = etree.SubElement(toc, 'h2') - toc_header.text = u'Spis treści' - toc_list = etree.SubElement(toc, 'ol') - - for n, section, text, subsections in sections: - section_element = etree.SubElement(toc_list, 'li') - add_anchor(section_element, "s%d" % n, with_target=False, link_text=text) - - if len(subsections): - subsection_list = etree.SubElement(section_element, 'ol') - for n, subsection, text, _ in subsections: - subsection_element = etree.SubElement(subsection_list, 'li') - add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text) - - root.insert(0, toc) - diff --git a/lib/librarian/tests/__init__.py b/lib/librarian/tests/__init__.py deleted file mode 100644 index 3f0254192..000000000 --- a/lib/librarian/tests/__init__.py +++ /dev/null @@ -1,115 +0,0 @@ -# -*- coding: utf-8 -*- -import unittest -from os.path import dirname, join, realpath - -from lxml import etree -from librarian import dcparser, html - - -def test_file_path(dir_name, file_name): - return realpath(join(dirname(__file__), 'files', dir_name, file_name)) - - -class TestDCParser(unittest.TestCase): - KNOWN_RESULTS = ( - ('dcparser', 'andersen_brzydkie_kaczatko.xml', { - 'publisher': u'Fundacja Nowoczesna Polska', - 'about': u'http://wiki.wolnepodreczniki.pl/Lektury:Andersen/Brzydkie_kaczątko', - 'source_name': u'Andersen, Hans Christian (1805-1875), Baśnie, Gebethner i Wolff, wyd. 7, Kraków, 1925', - 'author': u'Andersen, Hans Christian', - 'url': u'http://wolnelektury.pl/katalog/lektura/brzydkie-kaczatko', - 'created_at': u'2007-08-14', - 'title': u'Brzydkie kaczątko', - 'kind': u'Epika', - 'source_url': u'http://www.polona.pl/dlibra/doccontent2?id=3563&dirids=4', - 'translator': u'Niewiadomska, Cecylia', - 'released_to_public_domain_at': u'1925-01-01', - 'epoch': u'Romantyzm', - 'genre': u'Baśń', - 'technical_editor': u'Gałecki, Dariusz', - 'license_description': u'Domena publiczna - tłumacz Cecylia Niewiadomska zm. 1925', - }), - ('dcparser', 'kochanowski_piesn7.xml', { - 'publisher': u'Fundacja Nowoczesna Polska', - 'about': u'http://wiki.wolnepodreczniki.pl/Lektury:Kochanowski/Pieśni/Pieśń_VII_(1)', - 'source_name': u'Kochanowski, Jan (1530-1584), Dzieła polskie, tom 1, oprac. Julian Krzyżanowski, wyd. 8, Państwowy Instytut Wydawniczy, Warszawa, 1976', - 'author': u'Kochanowski, Jan', - 'url': u'http://wolnelektury.pl/katalog/lektura/piesni-ksiegi-pierwsze-piesn-vii-trudna-rada-w-tej-mierze-pr', - 'created_at': u'2007-08-31', - 'title': u'Pieśń VII (Trudna rada w tej mierze: przyjdzie się rozjechać...)', - 'kind': u'Liryka', - 'source_url': u'http://www.polona.pl/Content/1499', - 'released_to_public_domain_at': u'1584-01-01', - 'epoch': u'Renesans', - 'genre': u'Pieśń', - 'technical_editor': u'Gałecki, Dariusz', - 'license_description': u'Domena publiczna - Jan Kochanowski zm. 1584 ', - }), - ('dcparser', 'mickiewicz_rybka.xml', { - 'publisher': u'Fundacja Nowoczesna Polska', - 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Mickiewicz/Ballady/Rybka', - 'source_name': u'Mickiewicz, Adam (1798-1855), Poezje, tom 1 (Wiersze młodzieńcze - Ballady i romanse - Wiersze do r. 1824), Krakowska Spółdzielnia Wydawnicza, wyd. 2 zwiększone, Kraków, 1922', - 'author': u'Mickiewicz, Adam', - 'url': u'http://wolnelektury.pl/katalog/lektura/ballady-i-romanse-rybka', - 'created_at': u'2007-09-06', - 'title': u'Rybka', - 'kind': u'Liryka', - 'source_url': u'http://www.polona.pl/Content/2222', - 'released_to_public_domain_at': u'1855-01-01', - 'epoch': u'Romantyzm', - 'genre': u'Ballada', - 'technical_editor': u'Sutkowska, Olga', - 'license_description': u'Domena publiczna - Adam Mickiewicz zm. 1855', - }), - ('dcparser', 'sofokles_antygona.xml', { - 'publisher': u'Fundacja Nowoczesna Polska', - 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Sofokles/Antygona', - 'source_name': u'Sofokles (496-406 a.C.), Antygona, Zakład Narodowy im. Ossolińskich, wyd. 7, Lwów, 1939', - 'author': u'Sofokles', - 'url': u'http://wolnelektury.pl/katalog/lektura/antygona', - 'created_at': u'2007-08-30', - 'title': u'Antygona', - 'kind': u'Dramat', - 'source_url': u'http://www.polona.pl/Content/3768', - 'translator': u'Morawski, Kazimierz', - 'released_to_public_domain_at': u'1925-01-01', - 'epoch': u'Starożytność', - 'genre': u'Tragedia', - 'technical_editor': u'Gałecki, Dariusz', - 'license_description': u'Domena publiczna - tłumacz Kazimierz Morawski zm. 1925', - }), - ('dcparser', 'biedrzycki_akslop.xml', { - 'publisher': u'Fundacja Nowoczesna Polska', - 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Biedrzycki/Akslop', - 'source_name': u'Miłosz Biedrzycki, * ("Gwiazdka"), Fundacja "brulion", Kraków-Warszawa, 1993', - 'author': u'Biedrzycki, Miłosz', - 'url': u'http://wolnelektury.pl/katalog/lektura/akslop', - 'created_at': u'2009-06-04', - 'title': u'Akslop', - 'kind': u'Liryka', - 'source_url': u'http://free.art.pl/mlb/gwiazdka.html#t1', - 'epoch': u'Współczesność', - 'genre': u'Wiersz', - 'technical_editor': u'Sutkowska, Olga', - 'license': u'http://creativecommons.org/licenses/by-sa/3.0/', - 'license_description': u'Creative Commons Uznanie Autorstwa - Na Tych Samych Warunkach 3.0.PL' - }), - ) - - def test_parse(self): - for dir_name, file_name, result in self.KNOWN_RESULTS: - self.assertEqual(dcparser.parse(test_file_path(dir_name, file_name)).to_dict(), result) - - -class TestParserErrors(unittest.TestCase): - def test_error(self): - try: - html.transform(test_file_path('erroneous', 'asnyk_miedzy_nami.xml'), - test_file_path('erroneous', 'asnyk_miedzy_nami.html')) - self.fail() - except etree.XMLSyntaxError, e: - self.assertEqual(e.position, (25, 13)) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/lib/librarian/tests/files/dcparser/andersen_brzydkie_kaczatko.xml b/lib/librarian/tests/files/dcparser/andersen_brzydkie_kaczatko.xml deleted file mode 100644 index d653a9b5f..000000000 --- a/lib/librarian/tests/files/dcparser/andersen_brzydkie_kaczatko.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - Andersen, Hans Christian - Brzydkie kaczątko - Niewiadomska, Cecylia - Gałecki, Dariusz - Fundacja Nowoczesna Polska - Romantyzm - Epika - Baśń - Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. - http://wolnelektury.pl/katalog/lektura/brzydkie-kaczatko - http://www.polona.pl/dlibra/doccontent2?id=3563&dirids=4 - Andersen, Hans Christian (1805-1875), Baśnie, Gebethner i Wolff, wyd. 7, Kraków, 1925 - Domena publiczna - tłumacz Cecylia Niewiadomska zm. 1925 - 1925 - xml - text - text - 2007-08-14 - SP1 - pol - - \ No newline at end of file diff --git a/lib/librarian/tests/files/dcparser/biedrzycki_akslop.xml b/lib/librarian/tests/files/dcparser/biedrzycki_akslop.xml deleted file mode 100644 index da0cd9fa6..000000000 --- a/lib/librarian/tests/files/dcparser/biedrzycki_akslop.xml +++ /dev/null @@ -1,25 +0,0 @@ - - - Biedrzycki, Miłosz - Akslop - Sekuła, Aleksandra - Sutkowska, Olga - Fundacja Nowoczesna Polska - Współczesność - Liryka - Wiersz - Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). - http://wolnelektury.pl/katalog/lektura/akslop - http://free.art.pl/mlb/gwiazdka.html#t1 - Miłosz Biedrzycki, * ("Gwiazdka"), Fundacja "brulion", Kraków-Warszawa, 1993 - Creative Commons Uznanie Autorstwa - Na Tych Samych Warunkach 3.0.PL - http://creativecommons.org/licenses/by-sa/3.0/ - xml - text - text - 2009-06-04 - L - pol - - \ No newline at end of file diff --git a/lib/librarian/tests/files/dcparser/kochanowski_piesn7.xml b/lib/librarian/tests/files/dcparser/kochanowski_piesn7.xml deleted file mode 100644 index 96be1ae0e..000000000 --- a/lib/librarian/tests/files/dcparser/kochanowski_piesn7.xml +++ /dev/null @@ -1,27 +0,0 @@ - - - Kochanowski, Jan - Pieśń VII (Trudna rada w tej mierze: przyjdzie się rozjechać...) - http://www.wolnelektury.pl/lektura/piesni-ksiegi-pierwsze - Sekuła, Aleksandra - Krzyżanowski, Julian - Otwinowska, Barbara - Gałecki, Dariusz - Fundacja Nowoczesna Polska - Renesans - Liryka - Pieśń - Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. - http://wolnelektury.pl/katalog/lektura/piesni-ksiegi-pierwsze-piesn-vii-trudna-rada-w-tej-mierze-pr - http://www.polona.pl/Content/1499 - Kochanowski, Jan (1530-1584), Dzieła polskie, tom 1, oprac. Julian Krzyżanowski, wyd. 8, Państwowy Instytut Wydawniczy, Warszawa, 1976 - Domena publiczna - Jan Kochanowski zm. 1584 - 1584 - xml - text - text - 2007-08-31 - L - pol - - \ No newline at end of file diff --git a/lib/librarian/tests/files/dcparser/mickiewicz_rybka.xml b/lib/librarian/tests/files/dcparser/mickiewicz_rybka.xml deleted file mode 100644 index 0796a5b0f..000000000 --- a/lib/librarian/tests/files/dcparser/mickiewicz_rybka.xml +++ /dev/null @@ -1,28 +0,0 @@ - - - Mickiewicz, Adam - Rybka - http://www.wolnelektury.pl/lektura/ballady-i-romanse - Sekuła, Aleksandra - Kallenbach, Józef - Sutkowska, Olga - Fundacja Nowoczesna Polska - Romantyzm - Liryka - Ballada - Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. - http://wolnelektury.pl/katalog/lektura/ballady-i-romanse-rybka - http://www.polona.pl/Content/2222 - Mickiewicz, Adam (1798-1855), Poezje, tom 1 (Wiersze młodzieńcze - Ballady i romanse - Wiersze do r. 1824), Krakowska Spółdzielnia Wydawnicza, wyd. 2 zwiększone, Kraków, 1922 - Domena publiczna - Adam Mickiewicz zm. 1855 - 1855 - xml - text - text - 2007-09-06 - SP2 - G - L - pol - - \ No newline at end of file diff --git a/lib/librarian/tests/files/dcparser/sofokles_antygona.xml b/lib/librarian/tests/files/dcparser/sofokles_antygona.xml deleted file mode 100644 index 4acb2d4fc..000000000 --- a/lib/librarian/tests/files/dcparser/sofokles_antygona.xml +++ /dev/null @@ -1,25 +0,0 @@ - - - Sofokles - Antygona - Sekuła, Aleksandra - Morawski, Kazimierz - Gałecki, Dariusz - Fundacja Nowoczesna Polska - Starożytność - Dramat - Tragedia - Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. - http://wolnelektury.pl/katalog/lektura/antygona - http://www.polona.pl/Content/3768 - Sofokles (496-406 a.C.), Antygona, Zakład Narodowy im. Ossolińskich, wyd. 7, Lwów, 1939 - Domena publiczna - tłumacz Kazimierz Morawski zm. 1925 - 1925 - xml - text - text - 2007-08-30 - G - pol - - \ No newline at end of file diff --git a/lib/librarian/tests/files/erroneous/asnyk_miedzy_nami.html b/lib/librarian/tests/files/erroneous/asnyk_miedzy_nami.html deleted file mode 100644 index 1d7e17ff1..000000000 --- a/lib/librarian/tests/files/erroneous/asnyk_miedzy_nami.html +++ /dev/null @@ -1,46 +0,0 @@ -
    -
    -

    Spis treści

    -
      -
    -

    - Adam Asnyk - Między nami nic nie było -

    -
    -

    1Między nami nic nie było!

    -

    - Żadnych zwierzeń, wyznań żadnych!

    -

    - Nic nas z sobą nie łączyło —

    -

    - Prócz wiosennych marzeń zdradnych;

    -
    -
    -

    5Prócz tych woni, barw i blasków,

    -

    - Unoszących się w przestrzeni;

    -

    - Prócz szumiących śpiewem lasków

    -

    - I tej świeżej łąk zieleni;

    -
    -
    -

    Prócz tych kaskad i potoków,

    -

    10 - Zraszających każdy parów,

    -

    - Prócz girlandy tęcz, obłoków,

    -

    - Prócz natury słodkich czarów;

    -
    -
    -

    Prócz tych wspólnych, jasnych zdrojów,

    -

    - Z których serce zachwyt piło;

    -

    15 - Prócz pierwiosnków i powojów,—

    -

    - Między nami nic nie było!

    -
    -
    diff --git a/lib/librarian/tests/files/erroneous/asnyk_miedzy_nami.xml b/lib/librarian/tests/files/erroneous/asnyk_miedzy_nami.xml deleted file mode 100644 index aa5ef17ce..000000000 --- a/lib/librarian/tests/files/erroneous/asnyk_miedzy_nami.xml +++ /dev/null @@ -1,25 +0,0 @@ - - - Adam Asnyk - Między nami nic nie było - - Między nami nic nie było!/ - Żadnych zwierzeń, wyznań żadnych!/ - Nic nas z sobą nie łączyło ---/ - Prócz wiosennych marzeń zdradnych; - - Prócz tych woni, barw i blasków,/ - Unoszących się w przestrzeni;/ - Prócz szumiących śpiewem lasków/ - I tej świeżej łąk zieleni; - - Prócz tych kaskad i potoków,/ - Zraszających każdy parów,/ - Prócz girlandy tęcz, obłoków,/ - Prócz natury słodkich czarów; - - Prócz tych wspólnych, jasnych zdrojów,/ - Z których serce zachwyt piło;/ - Prócz pierwiosnków i powojów,---/ - Między nami nic nie było! - diff --git a/lib/librarian/text.py b/lib/librarian/text.py deleted file mode 100644 index 21fab8e33..000000000 --- a/lib/librarian/text.py +++ /dev/null @@ -1,81 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import cStringIO -import re -import codecs - -from lxml import etree - -from librarian import dcparser - - -ENTITY_SUBSTITUTIONS = [ - (u'---', u'—'), - (u'--', u'–'), - (u'...', u'…'), - (u',,', u'„'), - (u'"', u'”'), -] - - -MAX_LINE_LENGTH = 80 - - -def substitute_entities(context, text): - """XPath extension function converting all entites in passed text.""" - if isinstance(text, list): - text = ''.join(text) - for entity, substitutution in ENTITY_SUBSTITUTIONS: - text = text.replace(entity, substitutution) - return text - - -def wrap_words(context, text): - """XPath extension function automatically wrapping words in passed text""" - if isinstance(text, list): - text = ''.join(text) - words = re.split(r'\s', text) - - line_length = 0 - lines = [[]] - for word in words: - line_length += len(word) + 1 - if line_length > MAX_LINE_LENGTH: - # Max line length was exceeded. We create new line - lines.append([]) - line_length = len(word) - lines[-1].append(word) - return '\n'.join(' '.join(line) for line in lines) - - -# Register substitute_entities function with lxml -ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') -ns['substitute_entities'] = substitute_entities -ns['wrap_words'] = wrap_words - - -def transform(input_filename, output_filename): - """Transforms file input_filename in XML to output_filename in TXT.""" - # Parse XSLT - style_filename = os.path.join(os.path.dirname(__file__), 'book2txt.xslt') - style = etree.parse(style_filename) - - doc_file = cStringIO.StringIO() - expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE); - - f = open(input_filename, 'r') - for line in f: - line = line.decode('utf-8') - line = expr.sub(u'
    \n', line) - doc_file.write(line.encode('utf-8')) - f.close() - - doc_file.seek(0) - - parser = etree.XMLParser(remove_blank_text=True) - doc = etree.parse(doc_file, parser) - - result = doc.xslt(style) - output_file = codecs.open(output_filename, 'wb', encoding='utf-8') - output_file.write(unicode(result) % dcparser.parse(input_filename).url) - -- 2.20.1