From 6642c1c71c5c6ce6ef3401c8c9da84cf076b018b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20St=C4=99pniowski?= Date: Fri, 19 Mar 2010 17:00:19 +0100 Subject: [PATCH] Version 1.2.1. --- .gitignore | 3 + MANIFEST.in | 2 +- README.txt | 49 +++ librarian/__init__.py | 11 + librarian/__init__.pyc | Bin 151 -> 0 bytes librarian/book2txt.xslt | 19 +- librarian/dcparser.py | 323 ++++++++++++------ librarian/dcparser.pyc | Bin 8696 -> 0 bytes librarian/html.py | 30 +- librarian/html.pyc | Bin 8882 -> 0 bytes librarian/parser.py | 70 ++++ librarian/tests/__init__.py | 115 ------- librarian/tests/files/dcparser/.DS_Store | Bin 6148 -> 0 bytes librarian/text.py | 48 +-- librarian/text.pyc | Bin 2993 -> 0 bytes scripts/book2html | 24 +- scripts/book2txt | 30 +- setup.py | 12 +- tests/__init__.py | 0 .../dcparser/andersen_brzydkie_kaczatko.out | 17 + .../dcparser/andersen_brzydkie_kaczatko.xml | 0 tests/files/dcparser/biedrzycki_akslop.out | 17 + .../files/dcparser/biedrzycki_akslop.xml | 0 tests/files/dcparser/kochanowski_piesn7.out | 18 + .../files/dcparser/kochanowski_piesn7.xml | 2 +- tests/files/dcparser/mickiewicz_rybka.out | 18 + .../files/dcparser/mickiewicz_rybka.xml | 0 tests/files/dcparser/sofokles_antygona.out | 19 ++ .../files/dcparser/sofokles_antygona.xml | 0 .../andersen_brzydkie_kaczatko.out | 0 .../andersen_brzydkie_kaczatko.xml | 24 ++ tests/files/dcserialize/biedrzycki_akslop.out | 0 tests/files/dcserialize/biedrzycki_akslop.xml | 25 ++ .../files/dcserialize/kochanowski_piesn7.out | 0 .../files/dcserialize/kochanowski_piesn7.xml | 27 ++ tests/files/dcserialize/mickiewicz_rybka.out | 0 tests/files/dcserialize/mickiewicz_rybka.xml | 28 ++ tests/files/dcserialize/sofokles_antygona.out | 0 tests/files/dcserialize/sofokles_antygona.xml | 25 ++ .../files/erroneous/asnyk_miedzy_nami.html | 0 .../files/erroneous/asnyk_miedzy_nami.xml | 0 tests/files/text/asnyk_miedzy_nami.txt | 0 tests/files/text/asnyk_miedzy_nami.xml | 25 ++ tests/test_dcparser.py | 56 +++ tests/test_text.py | 22 ++ tests/utils.py | 62 ++++ 46 files changed, 836 insertions(+), 285 deletions(-) create mode 100644 README.txt delete mode 100644 librarian/__init__.pyc delete mode 100644 librarian/dcparser.pyc delete mode 100644 librarian/html.pyc create mode 100644 librarian/parser.py delete mode 100644 librarian/tests/__init__.py delete mode 100644 librarian/tests/files/dcparser/.DS_Store delete mode 100644 librarian/text.pyc mode change 100644 => 100755 setup.py create mode 100644 tests/__init__.py create mode 100644 tests/files/dcparser/andersen_brzydkie_kaczatko.out rename {librarian/tests => tests}/files/dcparser/andersen_brzydkie_kaczatko.xml (100%) create mode 100644 tests/files/dcparser/biedrzycki_akslop.out rename {librarian/tests => tests}/files/dcparser/biedrzycki_akslop.xml (100%) create mode 100644 tests/files/dcparser/kochanowski_piesn7.out rename {librarian/tests => tests}/files/dcparser/kochanowski_piesn7.xml (99%) create mode 100644 tests/files/dcparser/mickiewicz_rybka.out rename {librarian/tests => tests}/files/dcparser/mickiewicz_rybka.xml (100%) create mode 100644 tests/files/dcparser/sofokles_antygona.out rename {librarian/tests => tests}/files/dcparser/sofokles_antygona.xml (100%) create mode 100644 tests/files/dcserialize/andersen_brzydkie_kaczatko.out create mode 100644 tests/files/dcserialize/andersen_brzydkie_kaczatko.xml create mode 100644 tests/files/dcserialize/biedrzycki_akslop.out create mode 100644 tests/files/dcserialize/biedrzycki_akslop.xml create mode 100644 tests/files/dcserialize/kochanowski_piesn7.out create mode 100644 tests/files/dcserialize/kochanowski_piesn7.xml create mode 100644 tests/files/dcserialize/mickiewicz_rybka.out create mode 100644 tests/files/dcserialize/mickiewicz_rybka.xml create mode 100644 tests/files/dcserialize/sofokles_antygona.out create mode 100644 tests/files/dcserialize/sofokles_antygona.xml rename {librarian/tests => tests}/files/erroneous/asnyk_miedzy_nami.html (100%) rename {librarian/tests => tests}/files/erroneous/asnyk_miedzy_nami.xml (100%) create mode 100644 tests/files/text/asnyk_miedzy_nami.txt create mode 100644 tests/files/text/asnyk_miedzy_nami.xml create mode 100755 tests/test_dcparser.py create mode 100755 tests/test_text.py create mode 100644 tests/utils.py diff --git a/.gitignore b/.gitignore index 7189e7b..bfdc1af 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,6 @@ *.pyc MANIFEST dist +build +nbproject +nbproject/* diff --git a/MANIFEST.in b/MANIFEST.in index 4c76fc3..9b7ec3d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ include librarian/*.xslt -recursive-include librarian/tests/files/ *.xml +recursive-include tests/files/ *.xml diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..3debf76 --- /dev/null +++ b/README.txt @@ -0,0 +1,49 @@ +Librarian +========= +Librarian (*ang. bibliotekarz*) to biblioteka służąca do konwersji języka składu książek opartego na XML opracowanego przez Fundację Nowoczesna Polska na inne formaty. + +Obecnie obsługiwane są formaty: + + * XML + * TXT + +Biblioteka librarian potrafi również parsować metadane opisane przez DublinCore oraz wyciągać fragmenty motywów z lektur. + + +Wymagania +--------- + + * [lxml 2.2](http://codespeak.net/lxml/) + + +Instalacja +---------- +Zainstaluj biblioteki z sekcji *Wymagania* powyżej. Następnie rozpakuj archiwum z biblioteką librarian, przejdź w terminalu do rozpakowanego katalogu i wpisz: + +
python setup.py install
+ +Na Linuxie i OSX mogą być wymagane uprawnienia administratora. W takim wypadku wpisz: + +
sudo python setup.py install
+ +Alternatywnie możesz zainstalować bibliotekę librarian w wybranym przez siebie katalogu. W takim wypadku należy użyć argumentu *prefix* do *setup.py*: + +
python setup.py install --prefix=ŚCIEŻKA_DO_WYBRANEGO_KATALOGU
+ +W takim wypadku będzie jednak potrzebne własnoręczne edytowanie zmiennych systemowych *PATH* i *PYTHONPATH*. + + +Sposób użycia +------------- +Konwersja plików lektur do XHTML: + +
book2html LEKTURA1 LEKTURA2...
+ +Konwersja plików lektur do TXT: + +
book2txt LEKTURA1 LEKTURA2...
+ +Wyciągnięcie wszystkich fragmentów motywów z wygenerowanych plików XHTML: + +
bookfragments PLIK1 PLIK2...
+ diff --git a/librarian/__init__.py b/librarian/__init__.py index e69de29..9132f5c 100644 --- a/librarian/__init__.py +++ b/librarian/__init__.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +# exception classes + +class ParseError(Exception): + pass + +class ValidationError(Exception): + pass + +class NoDublinCore(ValidationError): + pass diff --git a/librarian/__init__.pyc b/librarian/__init__.pyc deleted file mode 100644 index 3d4eb1301d13decc7aa425513c20d7c2e466f51b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 151 zcmdn|iI+?LgQ{mT0~9a + + -Kodowanie znaków w dokumencie: UTF-8. ------ -Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl/). Reprodukcja cyfrowa wykonana przez -Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. Ten utwór nie jest chroniony prawem autorskim i znajduje -się w domenie publicznej, co oznacza, że możesz go swobodnie wykorzystywać, publikować i rozpowszechniać. - -Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dostępna jest na stronie %s. ------ - - - @@ -150,8 +141,8 @@ Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dost - -/ / + +/ / @@ -167,7 +158,7 @@ Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dost - + diff --git a/librarian/dcparser.py b/librarian/dcparser.py index 557509c..830b089 100644 --- a/librarian/dcparser.py +++ b/librarian/dcparser.py @@ -3,14 +3,10 @@ from xml.parsers.expat import ExpatError from datetime import date import time -# Import ElementTree from anywhere -try: - import xml.etree.ElementTree as etree # Python >= 2.5 -except ImportError: - try: - import elementtree.ElementTree as etree # effbot's pure Python module - except ImportError: - import lxml.etree as etree # ElementTree API using libxml2 +from librarian import ValidationError, NoDublinCore + +import lxml.etree as etree # ElementTree API using libxml2 +from lxml.etree import XMLSyntaxError # ============== @@ -21,7 +17,22 @@ class Person(object): def __init__(self, last_name, *first_names): self.last_name = last_name self.first_names = first_names - + + @classmethod + def from_text(cls, text): + parts = [ token.strip() for token in text.split(',') ] + if len(parts) == 1: + surname = parts[0] + names = [] + elif len(parts) != 2: + raise ValueError("Invalid person name. There should be at most one comma: \"%s\"." % text) + else: + surname = parts[0] + if len(parts[1]) == 0: + # there is no non-whitespace data after the comma + raise ValueError("Found a comma, but no names given: \"%s\" -> %r." % (text, parts)) + names = [ name for name in parts[1].split() if len(name) ] # all non-whitespace tokens + return cls(surname, *names) def __eq__(self, right): return self.last_name == right.last_name and self.first_names == right.first_names @@ -32,52 +43,71 @@ class Person(object): return '%s, %s' % (self.last_name, ' '.join(self.first_names)) else: return self.last_name - - + def __repr__(self): return 'Person(last_name=%r, first_names=*%r)' % (self.last_name, self.first_names) +def as_date(text): + try: + try: + t = time.strptime(text, '%Y-%m-%d') + except ValueError: + t = time.strptime(text, '%Y') + return date(t[0], t[1], t[2]) + except ValueError, e: + raise ValueError("Unrecognized date format. Try YYYY-MM-DD or YYYY.") -def str_to_unicode(value, previous): - return unicode(value) - +def as_person(text): + return Person.from_text(text) -def str_to_unicode_list(value, previous): - if previous is None: - previous = [] - previous.append(str_to_unicode(value, None)) - return previous +def as_unicode(text): + if isinstance(text, unicode): + return text + else: + return text.decode('utf-8') +class Field(object): + def __init__(self, uri, attr_name, type=as_unicode, multiple=False, salias=None, **kwargs): + self.uri = uri + self.name = attr_name + self.validator = type + self.multiple = multiple + self.salias = salias -def str_to_person(value, previous): - comma_count = value.count(',') - - if comma_count == 0: - last_name, first_names = value, [] - elif comma_count == 1: - last_name, first_names = value.split(',') - first_names = [name for name in first_names.split(' ') if len(name)] - else: - raise ValueError("value contains more than one comma: %r" % value) - - return Person(last_name.strip(), *first_names) + self.required = kwargs.get('required', True) and not kwargs.has_key('default') + self.default = kwargs.get('default', [] if multiple else [None]) + def validate_value(self, val): + try: + if self.multiple: + if self.validator is None: + return val + return [ self.validator(v) if v is not None else v for v in val ] + elif len(val) > 1: + raise ValidationError("Mulitply values not allowed for field '%s'" % self.uri) + elif len(val) == 0: + raise ValidationError("Field %s has no value to assign. Check your defaults." % self.uri) + else: + if self.validator is None or val[0] is None: + return val[0] + return self.validator(val[0]) + except ValueError, e: + raise ValidationError("Field '%s' - invald value: %s" % (self.uri, e.message)) -def str_to_date(value, previous): - try: - t = time.strptime(value, '%Y-%m-%d') - except ValueError: - t = time.strptime(value, '%Y') - return date(t[0], t[1], t[2]) + def validate(self, fdict): + if not fdict.has_key(self.uri): + if not self.required: + f = self.default + else: + raise ValidationError("Required field %s not found" % self.uri) + else: + f = fdict[self.uri] + return self.validate_value(f) # ========== # = Parser = # ========== -class ParseError(Exception): - def __init__(self, message): - super(ParseError, self).__init__(message) - class XMLNamespace(object): '''Represents XML namespace.''' @@ -102,96 +132,175 @@ class BookInfo(object): RDF = XMLNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#') DC = XMLNamespace('http://purl.org/dc/elements/1.1/') - mapping = { - DC('creator') : ('author', str_to_person), - DC('title') : ('title', str_to_unicode), - DC('subject.period') : ('epoch', str_to_unicode), - DC('subject.type') : ('kind', str_to_unicode), - DC('subject.genre') : ('genre', str_to_unicode), - DC('date') : ('created_at', str_to_date), - DC('date.pd') : ('released_to_public_domain_at', str_to_date), - DC('contributor.translator') : ('translator', str_to_person), - DC('contributor.technical_editor') : ('technical_editor', str_to_person), - DC('publisher') : ('publisher', str_to_unicode), - DC('source') : ('source_name', str_to_unicode), - DC('source.URL') : ('source_url', str_to_unicode), - DC('identifier.url') : ('url', str_to_unicode), - DC('relation.hasPart') : ('parts', str_to_unicode_list), - DC('rights.license') : ('license', str_to_unicode), - DC('rights') : ('license_description', str_to_unicode), - } + FIELDS = ( + Field( DC('creator'), 'author', as_person), + Field( DC('title'), 'title'), + Field( DC('subject.period'), 'epoches', salias='epoch', multiple=True), + Field( DC('subject.type'), 'kinds', salias='kind', multiple=True), + Field( DC('subject.genre'), 'genres', salias='genre', multiple=True), + Field( DC('date'), 'created_at', as_date), + Field( DC('date.pd'), 'released_to_public_domain_at', as_date, required=False), + Field( DC('contributor.editor'), 'editors', \ + as_person, salias='editor', multiple=True, default=[]), + Field( DC('contributor.translator'), 'translators', \ + as_person, salias='translator', multiple=True, default=[]), + Field( DC('contributor.technical_editor'), 'technical_editors', + as_person, salias='technical_editor', multiple=True, default=[]), + Field( DC('publisher'), 'publisher'), + Field( DC('source'), 'source_name', required=False), + Field( DC('source.URL'), 'source_url', required=False), + Field( DC('identifier.url'), 'url'), + Field( DC('relation.hasPart'), 'parts', multiple=True, required=False), + Field( DC('rights.license'), 'license', required=False), + Field( DC('rights'), 'license_description'), + ) @classmethod def from_string(cls, xml): from StringIO import StringIO return cls.from_file(StringIO(xml)) - + @classmethod - def from_file(cls, xml_file): - book_info = cls() - + def from_file(cls, xmlfile): + desc_tag = None try: - tree = etree.parse(xml_file) + iter = etree.iterparse(xmlfile, ['start', 'end']) + for (event, element) in iter: + if element.tag == cls.RDF('RDF') and event == 'start': + desc_tag = element + break + + if desc_tag is None: + raise NoDublinCore("DublinCore section not found. \ + Check if there are rdf:RDF and rdf:Description tags.") + + # continue 'till the end of RDF section + for (event, element) in iter: + if element.tag == cls.RDF('RDF') and event == 'end': + break + + # if there is no end, Expat should yell at us with an ExpatError + + # extract data from the element and make the info + return cls.from_element(desc_tag) + except XMLSyntaxError, e: + raise ParseError(e) except ExpatError, e: raise ParseError(e) - description = tree.find('//' + book_info.RDF('Description')) - book_info.wiki_url = description.get(cls.RDF('about'), None) - - if description is None: - raise ParseError('no Description tag found in document') - - for element in description.findall('*'): - book_info.parse_element(element) + @classmethod + def from_element(cls, rdf_tag): + # the tree is already parsed, so we don't need to worry about Expat errors + field_dict = {} + desc = rdf_tag.find(".//" + cls.RDF('Description') ) - return book_info + if desc is None: + raise NoDublinCore("No DublinCore section found.") + + for e in desc.getchildren(): + fv = field_dict.get(e.tag, []) + fv.append(e.text) + field_dict[e.tag] = fv + + return cls( desc.attrib, field_dict ) + + def __init__(self, rdf_attrs, dc_fields): + """rdf_attrs should be a dictionary-like object with any attributes of the RDF:Description. + dc_fields - dictionary mapping DC fields (with namespace) to list of text values for the + given field. """ + + self.about = rdf_attrs.get(self.RDF('about')) + self.fmap = {} + + for field in self.FIELDS: + value = field.validate( dc_fields ) + setattr(self, 'prop_' + field.name, value) + self.fmap[field.name] = field + if field.salias: self.fmap[field.salias] = field + + def __getattribute__(self, name): + try: + field = object.__getattribute__(self, 'fmap')[name] + value = object.__getattribute__(self, 'prop_'+field.name) + if field.name == name: + return value + else: # singular alias + if not field.multiple: + raise "OUCH!! for field %s" % name + + return value[0] + except (KeyError, AttributeError): + return object.__getattribute__(self, name) - def parse_element(self, element): + def __setattr__(self, name, newvalue): try: - attribute, converter = self.mapping[element.tag] - setattr(self, attribute, converter(element.text, getattr(self, attribute, None))) - except KeyError: - pass + field = object.__getattribute__(self, 'fmap')[name] + if field.name == name: + object.__setattr__(self, 'prop_'+field.name, newvalue) + else: # singular alias + if not field.multiple: + raise "OUCH! while setting field %s" % name - def to_xml(self): + object.__setattr__(self, 'prop_'+field.name, [newvalue]) + except (KeyError, AttributeError): + return object.__setattr__(self, name, newvalue) + + def update(self, field_dict): + """Update using field_dict. Verify correctness, but don't check if all + required fields are present.""" + for field in self.FIELDS: + if field_dict.has_key(field.name): + setattr(self, field.name, field_dict[field.name]) + + def to_etree(self, parent = None): """XML representation of this object.""" - etree._namespace_map[str(self.RDF)] = 'rdf' - etree._namespace_map[str(self.DC)] = 'dc' + #etree._namespace_map[str(self.RDF)] = 'rdf' + #etree._namespace_map[str(self.DC)] = 'dc' - root = etree.Element(self.RDF('RDF')) + if parent is None: + root = etree.Element(self.RDF('RDF')) + else: + root = parent.makeelement(self.RDF('RDF')) + description = etree.SubElement(root, self.RDF('Description')) - if self.wiki_url: - description.set(self.RDF('about'), self.wiki_url) + if self.about: + description.set(self.RDF('about'), self.about) - for tag, (attribute, converter) in self.mapping.iteritems(): - if hasattr(self, attribute): - e = etree.Element(tag) - e.text = unicode(getattr(self, attribute)) - description.append(e) + for field in self.FIELDS: + v = getattr(self, field.name, None) + if v is not None: + if field.multiple: + if len(v) == 0: continue + for x in v: + e = etree.Element(field.uri) + e.text = unicode(x) + description.append(e) + else: + e = etree.Element(field.uri) + e.text = unicode(v) + description.append(e) - return unicode(etree.tostring(root, 'utf-8'), 'utf-8') + return root def to_dict(self): - etree._namespace_map[str(self.RDF)] = 'rdf' - etree._namespace_map[str(self.DC)] = 'dc' - - result = {'about': self.wiki_url} - for tag, (attribute, converter) in self.mapping.iteritems(): - if hasattr(self, attribute): - result[attribute] = unicode(getattr(self, attribute)) + result = {'about': self.about} + for field in self.FIELDS: + v = getattr(self, field.name, None) + + if v is not None: + if field.multiple: + if len(v) == 0: continue + v = [ unicode(x) for x in v ] + else: + v = unicode(v) + result[field.name] = v + + if field.salias: + v = getattr(self, field.salias) + if v is not None: result[field.salias] = v return result - def parse(file_name): return BookInfo.from_file(file_name) - - -if __name__ == '__main__': - import sys - - info = parse(sys.argv[1]) - for attribute, _ in BookInfo.mapping.values(): - print '%s: %r' % (attribute, getattr(info, attribute, None)) - diff --git a/librarian/dcparser.pyc b/librarian/dcparser.pyc deleted file mode 100644 index 0e911b8d97be8a856f2b1d6fcbfb500ab12c811a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8696 zcmcIpTXPi074Df`t+YZ{1S@1?vKTgz9kgpms$x4hq%hdV7a5OiV`T7Tw9}GCTFore zy+}$RRS7)iAuoAwRo?UHO64&Rsmf2vA4tCM^y~#(NySxb!Swc=zMu1*)90M|+rMU~ z{}Eq*zpaYDdHntok9q1T7306B_Lb^*YSU95-+i_3t4>L6mfU+u?U&VNSsmA;J65+y^UV&Z z*xy{AIzOPEQt^UJby{g0bW!d3D*I}{fn~M1tTft|RdPnfC)LUmaBM{#T~+#&N?uUO zi(K=@{U51&zS=yiHeXWllG10DKBx2wD>(6!lG^rF?cMhteB#|n$?uZdSUm-4B>%?W zDkP%pG>V%?-N@d|^DK9?GH7vR^}4N~c3bBrO9v9=sOxyl6%-nb0{c+<6fOByqD)yS za}TUcsJJYZiV8MXCph+JC^nLGr=#m#nZACQ*xh<3GPa&Z2f7}maXqSc61->I_3b1t z8ceh8V{8czf8WJpYA7Cg>XA|*D!x=?QyX%e^;Pz?RTP{*~>n6WQ$K8c-2k{gz42DVSj~i()7Ssf|Og0>W?zuIXA-Pv722QpR5w!lrz8+XE7r4Ai(!wtU7f}Of$2bX619MQ9P=qf zG|!weq-9^jN@fa$@@BnhsEx>r@FDL-Y5t0e(-^1@4)hr3gURs494?BH01N!bCH1(h z(i{XaC^VLzN-NBy8;N_;^+Nyh;|cYsOcRQW3A5g~e@;EFC^E0Y_v{45qDw`ce9%@N zp5IQ$0$35I2HpbpR71z#dt~V&Sg;B81UoIOKak->{koW^OJcOJ5ExrUVSb20P*rbd zsg07<)DN;;*X?eU)*%>>4-TR?>y2Dy5&PZFda0F~>2?wu2#8sHPI$__EgImu*gWZq zk_H2D%55hThcegDdI%uGFE8Jh3iWx+qNm0e0CyMtPf1unp}cv2)?0+=PkYz=1z&_u zkP;Ch{7qEg20~xKKyaLbgcNJJu1Kdj=aqu842Z$B^l$HEUz?-5Y zBvKOmHSVr84%Qm6=w0LPP-eKEpbS($;g3z=IdFS8QI{{li+BkrU2z~rI4(A2%)I1{ zQ}B}B*>%1W2FfczdFH%jPfYP`x*LLuHXHfJ-AMOER5V;Nu^~@U@#&JWS=i6xQUt<& zUhLp2a17pqsbB`o2~bbJ{TZu5bu~T6H)x`ov?&Zl)N&)YkG4qML{tVCa z*BKX=EfS4Gm&lW7WkdF~F0RyX8s>4sI1f(t$HO_lmUGx>XS_Qu+?saO>4f1ISmD3! zoHrV8+<|(CV)T`Q+Z0taHp;DGXd@mfly`*VucFtO_fI5-u<`C@3Bxb(-aiB7&SwA+ znv((dYkj`cG>?LLl`bd8IfO`!YCYbai)Z*1ru~k)USEo=9 zh8Bm+6SEWin6)Mg>S54AG31!k#^FB(oGHL#D0w)(ezoNO36!sA+5W9`J8Pp+{wTQX zc+5B8lQT#f7bDC&(hDWUx&Rrm`!3uh_*r^_TIp*E=F#hF8P}+;42a@CfjBWr6ekMA z$x-4I7Xh(4N}Mi;CzPHX%sEpKXGe*11#y0KOA7^Yag=znAR-?!+~-n3JT*!@T@Y)d z#N~o`W|X*65MNNpKJ^)b9E_X@S(o8LFBL@OVTQyX2yqifpH^|5xeziOhsNvq{wp$1 zFSE|f(*>y@^K?-v^SlHa6dZB2Yi;+!WKkztY+^fFYPy@X zcMY*_dRu$CZJW68lPng`Xg^6~!C=459(Uca9i8IF5;w3ICKNG`vXG9$$chhT&tmc% zTZ;^c2#LDAtxnQLD0u)!8NCc+1G+xVZY&enJW5T6o0TLG4PTyY_3+*tHd$(=)$Lt` z>QN`ual-yw>zV$~b&!0KY{2a5Ts&x#_42lM`6IcJ<=q*2*8~oWN9KAqKMOt-yH7G1 zm&D+CvYj9q1d?zSxgl%BAI8BWn`CKoH!^VHtcU`~*QwDmTGBYCS&%HRprRhe+O*;J zbGQi8brqM67iAfbT(}odfrdk`_c0i;BeI}y5a?d_ojU+?Lma8~ktE?1#%+ML>CUZ> z-3@jF6k*YJo*jhSNk_XB8kcK#j2!#upcAm4m^&%PVe?$-j;q2NV@KG3kw^0vh7lx@ zB1irwD(Lb=4vTDY5h;k7LqK>ZJwFH~P9u~+7|9S1ksKI-@J_-A26!-l!X-p{U>_Qn z@i2~*^fZ}>2IO4Dvxc_}hK5sYja{&g+#TkP2lK*>KDsEQs|+YH?#BHkMxhv(Oc#Nu z_uwz;4ha2Vh1XrHB^Rn`@`9lC0#FqQuV??cRoVi$sGP@5t|twgK~(a?2bh`$WvZ|+ zH7+E5r#}eT5Yz~Uws21&E4rPzD2rR+J?^r!quAyO{rOOJ$QlQzf}5D_cX$k=9_3g4 zlDFh7c#Hm7?;OI^1%Cn8>!J%%%jm)TR`7KXmE-G#7@R@6gq~WtClM&n<?P_KX>T@E`Sm~eB4QwfWnFdw38Chy6+RiIEyD21|J}61Jnof9))1EjjYWb>-3Sc z8xRnQLm_>jk6nVFn->>4xXT%;DAm>xOZAzM_ zqeT^tE`S1EbBgLX2*yD{u0rvip*rV-B+1C2m(xJ>MOKV-sFQ?1Y?9t!ICDs7np9a` z@M6z`JP!ARGJejd|mNBcs+$0fKb{Nl37FD?)*%Q6iBw2>0U>_|Gj4lQ@vksULs zG^3ylgC1OPAhIxw;we2kyq7K`^$7#fS7-ClcrmEl+LAqq)Dx)d|x*AN&B*4W6Q zGR=**1#35*+aOooMsKTl4L)JF5(;23VAe)}lmJ6`4vY-NttN<+Men0eaE}GQ-G~*j znKS3kPKfJ4sODK_1BPM(_yD1-3Vy}n4vO(2RBldq9WYIQ#^i-1sChO2jQ65H?bTre zPQoV4duP1UF2T=j@)6t;+}{PaySSKv3t-m=Y_9^bN4EuR^0iaIc1>XB7mx$qz$!vX z3`h17z*c3TIUo^T_>b8htOHT&xIB2BlacE|gfiG-K^%zf7wHFW(gyMlV!l$OBFX^U zQ~_+83I?K+Z+I>ar`7_Cy^eiOKx3AT*uN}{leYZ^OBNYK3I`$)v7Gl;CH$hJ-iZ^kbpA^N}mEtlpe=1)~8t4nR#PCnFh!62_)S`aZ~K*MN8BH$8oE{W$wN9nkU z;t$U@mScT{rT&hm2I)nSLE}65mP5_ zP#B20tMQv!b*_4%Hc?xueO@ik L0\n', line) - doc_file.write(line.encode('utf-8')) - f.close() - - doc_file.seek(0); + if is_file: + document = WLDocument.from_file(input, True) + else: + document = WLDocument.from_string(input, True) - parser = etree.XMLParser(remove_blank_text=True) - doc = etree.parse(doc_file, parser) + result = document.transform(style) + del document # no longer needed large object :) - result = doc.xslt(style) if result.find('//p') is not None: add_anchors(result.getroot()) add_table_of_contents(result.getroot()) - result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8') + + if output_filename is not None: + result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8') + else: + return result return True else: return False diff --git a/librarian/html.pyc b/librarian/html.pyc deleted file mode 100644 index dfb837db2bd1077d904719c9d591cb02dd42d5bc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8882 zcmb_iU2GiJb-pu8F1ag`qDY!DEj!vsp}n$2GF4IAP-(@sC`&>lN}ZKTnH95H?hMHx zmov+`vl5v~K@hqr`c@zT`cNcq$#c=X1n5%>B#$jx6etP=C{UmeeJarRqW!+_?k*|C zE)bZa=JNjEbMHClJLlZXpa1u0`O&|ipLwxqvNJ9Mg%#Jau{k&&ZyT$}1XG}C=Hn8{vYYvV1bam8(lWHC_(P{n%wc(Wc48)u< z;VWvIFyXW^lO{Z?%oMxTs&8O}@&DrAa@9jNA4C%6Srl~&$j!~o>G!{X??)Y9eMI{NVP3PwTaZAJ>q1_~7=v79hYgy1)I5Un?DtTH-JNa+4 z+*Un>B%|@Jr$}Fczlv9ryVOR&!Y^9kMMVr9lG|eFct}Sn@ zx1xHwQ5TXVxz*C2G+H*Qa=cLFML7^FUXlO@Hu(YO>WPacvaHg1a`7PZo}^zu!M=^e6u#=6_D8(e{nJ3LqIcSx@Lu!E z{-{6UO`vYt8^_OiZ`jv%KFJ!JyI^ROL-~)=O&zkoha`gah58v8Mi_HX8CY^(8JO{c zGK18~&(OJOf|?9OlVZ{Pu5-RCNPv4dE?^Z<0w(Q{(4ul3_6wHFnEHU(g!w}Ta~dc$ z=~=GwP#q&W?I;W0XOT)$ug6K8*XvS`IMhk!ThVT0ofJO7)@+q@h(R`LZ3MrMQm}~R z1epWbgqJ?V-_z!iCyHaE+Dn zM7S|20!$qFcanRdr*I3OB_YOZk_7|!$ON(SFOUJ3;9~3_bhop#mK<|MBr4nRUe1ca z*efIkeo1$)d*i4^AK$Rg?-=i3Ozw-5h?TwXngQs}cG;xph<}i>n;3ob7FxVV9F32z z4ICcHkWLikvo*OC%UzvqWVv(@=O=@H{m#AGjit)+z2&)k_irp$9(;83s-#t?9dO_3 z-%i_daBe4Iw0|SZ(o9F4Mevb;Cp?QIR=@*l?gSsQ?js~8$OxEqfHKiDtYc|N@;6ao zUqxcPqF07bGU}bhSMbL0Gwqj;ctt!Hul@s>F0cNf%PSn&(%RpC9=jgB$S$ZTjy{II zOS{1W9C$p)Ghc$i#(yw$nx>mgMUzhT#=A0G_Wl_koLKUEQ2c(k*9rav`yHEN=%p#( zqa2?C42R0V&0*!FS%D7%;=p%Q)T-3H<$y}q;qDh_ZDrP86FgU%l;HwC^3L+Rgk=8$ zPh{3h(eb=H1xVb)$)c5LR1f}Z)WYn>gXPfkzP>JRNry;kONY$caoS64@?f}aM)O-Omr;~y)alJ5?HVAreOz8L|ukK z{BWEhkpbPaBl?v4(t(U@BH5)ADte`2sZbg#jq3z?>}0l1@XtZ|me8B2Z}y-1E*`WA9~)>+PD43aODe(J?Q>JH(>6e`^Zd~Z7BPXJgPka0wCTVf<|6YX!>&JqLlt0 z;iseXaBr>>brv2ydlSTcgfYwuT)5aUgLf`=%4`=*`aOU~uXz)yiYVtBjB?=k&wSya za~Ome1~Ja@j#${+`<3w&-zdED2n39nrWFt&zQNrC437jN#23^D$9JGcjimsS@&GjkFa_3Rx$%?##ynHo3%gx2kd>yq?Zxu zU*alDYi}4tTI?{49fLlYO!dJTd3%5Dt?Yf@Cks486Wyg0=b;%JZeyu#ZJ&$dVhrN2 zJ{MC$^KZBCJAQxR{dXLp7hfb4v%`?ZI0F}4lH5PSs0*;**tl#+tvW@?okMbCA3kj} zw-vc@m5r`{ibyU?cVCL+=AGRpi5Z3wHZ6(CwU{`3at)}u|_RS11lhDRVjQ09Ik z4&@K3ok8CmBQ{LNK34#>`2G*>_g@sBj#VL01SgJyUM!CM?h6A*mDg#NA|>YLGl zoS8tEOq>M)vGBi{whyYDB)H7ZT$tOe_fJ0`M^)Dq=~o_^2uTxL8Q}U&;Pa!v0M!`9vc^4$xTS$yw^ostp8S!Vlsltr+)xvp%MQHNN?agY!92Xi~@eqwJj#2d9L6PAPL~R9$%;>BW6X*wISQn%f=gUZt*;gWU!$mq$SzIWnu4|K} z1x~E#=s?wRwIJIp4@T?bf@LG#*Y(5GINz!xSl^LMs^k_|D;soY6Tm(ayQt5+d8oTi zCn>lBi~iNdPqCE!8WQ6T!3ipRGYHjB<2N1sDSt@iQn%bBgiwk3NgBV746TI@_>fNi zIofw{A2JFfkPb^SYI~!A;8C*z&$Sm{a}Mp4D8n0Fo8zV>AT7Z=NL=_y_rm#$XqV3S zx}M~jP@Edc0k3;R#^0bUr^diZgrkVlXwu;$&Ha${KT&{|F(m!<@e}xaU90K@+@%F1 zH>I;O>6{haLpUEM{WkPLH|SImTt7(6>FLl$LW4bdI%iyA$V|gWVf+d&gUSmncbZ18 zh8Mb?(?|_`kJs_M%7?yT$(QDz{lXdCvc1uYdHNcMSO}QRgdZ`fhRvnOemg+`Qxq3Sd+ql@LpoUQxH{FdTi|ehlxwF^7 zKo>gfqb@jL@*O6Bj6`NGSY`1pizHM)5uK#K3}e>CR%3TPY+U;#@&c7z_TzTOfb8gs zNw<h&87=~kRkI}Oig{SwNb_|*s zEHl%+;kkNLS%mW{Wx>o`3=uqx9r7?XdLDqpg3Jy1ie9pyk-xTJgGt4=0%r3)XhXsD5tYVKaesUC9KI?J@BU#&Q3qgm)9r^o^3 zpRn!+NUDUN;QLI7pALC=3FO>+0=MvOUr$1$Rn}TTL$J-_4!aPoYn}D31Ld40ddF2F zL~zJ1#84aQG1vyscX#y@{2ALg96PCRMU4=C6}z&i_g\n', data) + + try: + parser = etree.XMLParser(remove_blank_text=True) + return cls( etree.parse(StringIO(data), parser) ) + except XMLSyntaxError, e: + raise ParseError(e.message) + except ExpatError, e: + raise ParseError(e.message) + + def transform(self, stylesheet, **options): + return self.edoc.xslt(stylesheet, **options) + + def update_dc(self): + parent = self.rdf_elem.getparent() + parent.replace( self.rdf_elem, self.book_info.to_etree(parent) ) + + def serialize(self): + self.update_dc() + return etree.tostring(self.edoc, encoding=unicode, pretty_print=True) diff --git a/librarian/tests/__init__.py b/librarian/tests/__init__.py deleted file mode 100644 index 3f02541..0000000 --- a/librarian/tests/__init__.py +++ /dev/null @@ -1,115 +0,0 @@ -# -*- coding: utf-8 -*- -import unittest -from os.path import dirname, join, realpath - -from lxml import etree -from librarian import dcparser, html - - -def test_file_path(dir_name, file_name): - return realpath(join(dirname(__file__), 'files', dir_name, file_name)) - - -class TestDCParser(unittest.TestCase): - KNOWN_RESULTS = ( - ('dcparser', 'andersen_brzydkie_kaczatko.xml', { - 'publisher': u'Fundacja Nowoczesna Polska', - 'about': u'http://wiki.wolnepodreczniki.pl/Lektury:Andersen/Brzydkie_kaczątko', - 'source_name': u'Andersen, Hans Christian (1805-1875), Baśnie, Gebethner i Wolff, wyd. 7, Kraków, 1925', - 'author': u'Andersen, Hans Christian', - 'url': u'http://wolnelektury.pl/katalog/lektura/brzydkie-kaczatko', - 'created_at': u'2007-08-14', - 'title': u'Brzydkie kaczątko', - 'kind': u'Epika', - 'source_url': u'http://www.polona.pl/dlibra/doccontent2?id=3563&dirids=4', - 'translator': u'Niewiadomska, Cecylia', - 'released_to_public_domain_at': u'1925-01-01', - 'epoch': u'Romantyzm', - 'genre': u'Baśń', - 'technical_editor': u'Gałecki, Dariusz', - 'license_description': u'Domena publiczna - tłumacz Cecylia Niewiadomska zm. 1925', - }), - ('dcparser', 'kochanowski_piesn7.xml', { - 'publisher': u'Fundacja Nowoczesna Polska', - 'about': u'http://wiki.wolnepodreczniki.pl/Lektury:Kochanowski/Pieśni/Pieśń_VII_(1)', - 'source_name': u'Kochanowski, Jan (1530-1584), Dzieła polskie, tom 1, oprac. Julian Krzyżanowski, wyd. 8, Państwowy Instytut Wydawniczy, Warszawa, 1976', - 'author': u'Kochanowski, Jan', - 'url': u'http://wolnelektury.pl/katalog/lektura/piesni-ksiegi-pierwsze-piesn-vii-trudna-rada-w-tej-mierze-pr', - 'created_at': u'2007-08-31', - 'title': u'Pieśń VII (Trudna rada w tej mierze: przyjdzie się rozjechać...)', - 'kind': u'Liryka', - 'source_url': u'http://www.polona.pl/Content/1499', - 'released_to_public_domain_at': u'1584-01-01', - 'epoch': u'Renesans', - 'genre': u'Pieśń', - 'technical_editor': u'Gałecki, Dariusz', - 'license_description': u'Domena publiczna - Jan Kochanowski zm. 1584 ', - }), - ('dcparser', 'mickiewicz_rybka.xml', { - 'publisher': u'Fundacja Nowoczesna Polska', - 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Mickiewicz/Ballady/Rybka', - 'source_name': u'Mickiewicz, Adam (1798-1855), Poezje, tom 1 (Wiersze młodzieńcze - Ballady i romanse - Wiersze do r. 1824), Krakowska Spółdzielnia Wydawnicza, wyd. 2 zwiększone, Kraków, 1922', - 'author': u'Mickiewicz, Adam', - 'url': u'http://wolnelektury.pl/katalog/lektura/ballady-i-romanse-rybka', - 'created_at': u'2007-09-06', - 'title': u'Rybka', - 'kind': u'Liryka', - 'source_url': u'http://www.polona.pl/Content/2222', - 'released_to_public_domain_at': u'1855-01-01', - 'epoch': u'Romantyzm', - 'genre': u'Ballada', - 'technical_editor': u'Sutkowska, Olga', - 'license_description': u'Domena publiczna - Adam Mickiewicz zm. 1855', - }), - ('dcparser', 'sofokles_antygona.xml', { - 'publisher': u'Fundacja Nowoczesna Polska', - 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Sofokles/Antygona', - 'source_name': u'Sofokles (496-406 a.C.), Antygona, Zakład Narodowy im. Ossolińskich, wyd. 7, Lwów, 1939', - 'author': u'Sofokles', - 'url': u'http://wolnelektury.pl/katalog/lektura/antygona', - 'created_at': u'2007-08-30', - 'title': u'Antygona', - 'kind': u'Dramat', - 'source_url': u'http://www.polona.pl/Content/3768', - 'translator': u'Morawski, Kazimierz', - 'released_to_public_domain_at': u'1925-01-01', - 'epoch': u'Starożytność', - 'genre': u'Tragedia', - 'technical_editor': u'Gałecki, Dariusz', - 'license_description': u'Domena publiczna - tłumacz Kazimierz Morawski zm. 1925', - }), - ('dcparser', 'biedrzycki_akslop.xml', { - 'publisher': u'Fundacja Nowoczesna Polska', - 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Biedrzycki/Akslop', - 'source_name': u'Miłosz Biedrzycki, * ("Gwiazdka"), Fundacja "brulion", Kraków-Warszawa, 1993', - 'author': u'Biedrzycki, Miłosz', - 'url': u'http://wolnelektury.pl/katalog/lektura/akslop', - 'created_at': u'2009-06-04', - 'title': u'Akslop', - 'kind': u'Liryka', - 'source_url': u'http://free.art.pl/mlb/gwiazdka.html#t1', - 'epoch': u'Współczesność', - 'genre': u'Wiersz', - 'technical_editor': u'Sutkowska, Olga', - 'license': u'http://creativecommons.org/licenses/by-sa/3.0/', - 'license_description': u'Creative Commons Uznanie Autorstwa - Na Tych Samych Warunkach 3.0.PL' - }), - ) - - def test_parse(self): - for dir_name, file_name, result in self.KNOWN_RESULTS: - self.assertEqual(dcparser.parse(test_file_path(dir_name, file_name)).to_dict(), result) - - -class TestParserErrors(unittest.TestCase): - def test_error(self): - try: - html.transform(test_file_path('erroneous', 'asnyk_miedzy_nami.xml'), - test_file_path('erroneous', 'asnyk_miedzy_nami.html')) - self.fail() - except etree.XMLSyntaxError, e: - self.assertEqual(e.position, (25, 13)) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/librarian/tests/files/dcparser/.DS_Store b/librarian/tests/files/dcparser/.DS_Store deleted file mode 100644 index 8817fe608d38202e1a6a534b520570045b510a0f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK!AiqG5Pe%KRJ` zzU^e^?d;o-?E-M)_t`Zt1Tdfr_Dx`a-FgAMXY5`Ch2d;v}A=-vSuB5 zMx3?ABfa2Sqy|}8MTORQVzlbaw2$wG)np2o0*9*r&up>5X+UdD0aL&f*eW332TvDF zA~pf_)4@is0K`7SYV6BhLUMe>Bw`bgPiQWr5<{wsR}2@@8TWBq60r#w(&2Pw^2C{4 zyrDRqopGtDuXqqz`rW+4Xmw^kpKVy diff --git a/librarian/text.py b/librarian/text.py index db0d2b2..0754a99 100644 --- a/librarian/text.py +++ b/librarian/text.py @@ -18,7 +18,20 @@ ENTITY_SUBSTITUTIONS = [ ] -MAX_LINE_LENGTH = 80 +TEMPLATE = u"""\ +Kodowanie znaków w dokumencie: UTF-8. +----- +Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl/). Reprodukcja cyfrowa wykonana przez +Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. Ten utwór nie jest chroniony prawem autorskim i znajduje +się w domenie publicznej, co oznacza, że możesz go swobodnie wykorzystywać, publikować i rozpowszechniać. + +Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dostępna jest na stronie %(url)s. +----- + + + +%(text)s +""" def strip(context, text): @@ -37,17 +50,20 @@ def substitute_entities(context, text): return text -def wrap_words(context, text): +def wrap_words(context, text, wrapping): """XPath extension function automatically wrapping words in passed text""" if isinstance(text, list): text = ''.join(text) + if not wrapping: + return text + words = re.split(r'\s', text) line_length = 0 lines = [[]] for word in words: line_length += len(word) + 1 - if line_length > MAX_LINE_LENGTH: + if line_length > wrapping: # Max line length was exceeded. We create new line lines.append([]) line_length = len(word) @@ -62,28 +78,22 @@ ns['substitute_entities'] = substitute_entities ns['wrap_words'] = wrap_words -def transform(input_filename, output_filename): +def transform(input_filename, output_filename, **options): """Transforms file input_filename in XML to output_filename in TXT.""" # Parse XSLT style_filename = os.path.join(os.path.dirname(__file__), 'book2txt.xslt') style = etree.parse(style_filename) - doc_file = cStringIO.StringIO() - expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE); - - f = open(input_filename, 'r') - for line in f: - line = line.decode('utf-8') - line = expr.sub(u'
\n', line) - doc_file.write(line.encode('utf-8')) - f.close() + if is_file: + document = WLDocument.from_file(input, True) + else: + document = WLDocument.from_string(input, True) - doc_file.seek(0) + result = document.transform(style, **options) - parser = etree.XMLParser(remove_blank_text=True) - doc = etree.parse(doc_file, parser) - - result = doc.xslt(style) output_file = codecs.open(output_filename, 'wb', encoding='utf-8') - output_file.write(unicode(result) % dcparser.parse(input_filename).url) + output_file.write(TEMPLATE % { + 'url': dcparser.parse(input_filename).url, + 'text': unicode(result), + }) diff --git a/librarian/text.pyc b/librarian/text.pyc deleted file mode 100644 index 6c6eb91802d9dea061615b0f83adc4b5dc39c1c6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2993 zcmb_eUvC>l5T8B&N$kW;nxtvcLgj*lEY#RlUnmetX%irjy3xgH#<<7a? zYaHZAmC83LukgqlPdxH%_y~Lf_|2S?lp-V~>OXrkyK_6UJHOxT{`yCyZ0paz>eBEt zj^|zc?JsBo{1ojIZJo)Aw$y3AM_b;hoTIIr$R6$IDJ;-dffT==qx~XGY1o`3rKAx6)9nnCMQ}n{4cpB?hWIXP|l1>ywid6qS zayeSszDAEdI>9f;<0el780Ppz7#_Y)kI=hbV=wkoG@EN3_oS-(PDj?nQFS+sx{ifz z96i&CGtpkP6NXhCIpeggnyA|ASgV7o!%KApMpWYsa%D8en#j6N)YXpH5E|=5cp96? z&7fzw*;;9R+g{(ew+>;WweqmJ?nW?>=w8_A>KeZ$9q`W{7?CKB-35p?-LVKM z#TVh$&_mMLk0CWSkQ>|hF5S^dV=alF>V0?IIEuqahY-``xZVpJq1oYo(n(AwYRGze z$8P%7$#q*?8g1ven?X(9q?%SEG_GcqWQzmi9guAO2H`(s3#meCf-HxUFW?&~+auaT zO7qm;=EN;+Z_r7B^9K+P3Y_jBPnSR7>&BplCem$8vge-22`>62`mYr4<>oa@& zc{D;B)|mN1o7>1GXajrzl|uml&kN$E?4hJ#(Wc0>02Y94lnFCRFNo>lN@P&-VFO`I z+HtzaMXx}v!f*n=SS?b2nFd8L>k~QwBQOW5zW8Ak9wwMA=x1y6fLW7bc5!G1=jt#* z9R-7?*uAiCn!qR?;vi?3Snd)A)ye2-*r>#a7L%4}fFNh3Dq|sKa+k>;=Zk!z0>aGbjighn_Z5E@> z0jt%`2dh;VSL4*ZKGNE3)olfn?8Nc@Er;6Px1qC)+s0!nTuUsoEOomJw^Ih{;!e`I zQx;;oi_jB&5I@uHov;(_x20w?7ms!v16oJjI6$Fj)~-Smg2WkkdpLnI)-W6blW?%| zPFrGcw|(?yWRO!AIR=K-U4rRW9*X6u;!mL pol - \ No newline at end of file + diff --git a/tests/files/dcparser/mickiewicz_rybka.out b/tests/files/dcparser/mickiewicz_rybka.out new file mode 100644 index 0000000..ff4bd98 --- /dev/null +++ b/tests/files/dcparser/mickiewicz_rybka.out @@ -0,0 +1,18 @@ +{ + 'editors': [u'Sekuła, Aleksandra', u'Kallenbach, Józef'], + 'publisher': u'Fundacja Nowoczesna Polska', + 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Mickiewicz/Ballady/Rybka', + 'source_name': u'Mickiewicz, Adam (1798-1855), Poezje, tom 1 (Wiersze młodzieńcze - Ballady i romanse - Wiersze do r. 1824), Krakowska Spółdzielnia Wydawnicza, wyd. 2 zwiększone, Kraków, 1922', + 'author': u'Mickiewicz, Adam', + 'url': u'http://wolnelektury.pl/katalog/lektura/ballady-i-romanse-rybka', + 'created_at': u'2007-09-06', + 'title': u'Rybka', + 'kind': u'Liryka', + 'source_url': u'http://www.polona.pl/Content/2222', + 'released_to_public_domain_at': u'1855-01-01', + 'epoch': u'Romantyzm', + 'genre': u'Ballada', + 'technical_editors': [u'Sutkowska, Olga'], + 'license_description': u'Domena publiczna - Adam Mickiewicz zm. 1855', +} + diff --git a/librarian/tests/files/dcparser/mickiewicz_rybka.xml b/tests/files/dcparser/mickiewicz_rybka.xml similarity index 100% rename from librarian/tests/files/dcparser/mickiewicz_rybka.xml rename to tests/files/dcparser/mickiewicz_rybka.xml diff --git a/tests/files/dcparser/sofokles_antygona.out b/tests/files/dcparser/sofokles_antygona.out new file mode 100644 index 0000000..0f2b4d0 --- /dev/null +++ b/tests/files/dcparser/sofokles_antygona.out @@ -0,0 +1,19 @@ +{ + 'editors': [u'Sekuła, Aleksandra'], + 'publisher': u'Fundacja Nowoczesna Polska', + 'about': 'http://wiki.wolnepodreczniki.pl/Lektury:Sofokles/Antygona', + 'source_name': u'Sofokles (496-406 a.C.), Antygona, Zakład Narodowy im. Ossolińskich, wyd. 7, Lwów, 1939', + 'author': u'Sofokles', + 'url': u'http://wolnelektury.pl/katalog/lektura/antygona', + 'created_at': u'2007-08-30', + 'title': u'Antygona', + 'kind': u'Dramat', + 'source_url': u'http://www.polona.pl/Content/3768', + 'translators': [u'Morawski, Kazimierz'], + 'released_to_public_domain_at': u'1925-01-01', + 'epoch': u'Starożytność', + 'genre': u'Tragedia', + 'technical_editors': [u'Gałecki, Dariusz'], + 'license_description': u'Domena publiczna - tłumacz Kazimierz Morawski zm. 1925', +} + diff --git a/librarian/tests/files/dcparser/sofokles_antygona.xml b/tests/files/dcparser/sofokles_antygona.xml similarity index 100% rename from librarian/tests/files/dcparser/sofokles_antygona.xml rename to tests/files/dcparser/sofokles_antygona.xml diff --git a/tests/files/dcserialize/andersen_brzydkie_kaczatko.out b/tests/files/dcserialize/andersen_brzydkie_kaczatko.out new file mode 100644 index 0000000..e69de29 diff --git a/tests/files/dcserialize/andersen_brzydkie_kaczatko.xml b/tests/files/dcserialize/andersen_brzydkie_kaczatko.xml new file mode 100644 index 0000000..d653a9b --- /dev/null +++ b/tests/files/dcserialize/andersen_brzydkie_kaczatko.xml @@ -0,0 +1,24 @@ + + + Andersen, Hans Christian + Brzydkie kaczątko + Niewiadomska, Cecylia + Gałecki, Dariusz + Fundacja Nowoczesna Polska + Romantyzm + Epika + Baśń + Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. + http://wolnelektury.pl/katalog/lektura/brzydkie-kaczatko + http://www.polona.pl/dlibra/doccontent2?id=3563&dirids=4 + Andersen, Hans Christian (1805-1875), Baśnie, Gebethner i Wolff, wyd. 7, Kraków, 1925 + Domena publiczna - tłumacz Cecylia Niewiadomska zm. 1925 + 1925 + xml + text + text + 2007-08-14 + SP1 + pol + + \ No newline at end of file diff --git a/tests/files/dcserialize/biedrzycki_akslop.out b/tests/files/dcserialize/biedrzycki_akslop.out new file mode 100644 index 0000000..e69de29 diff --git a/tests/files/dcserialize/biedrzycki_akslop.xml b/tests/files/dcserialize/biedrzycki_akslop.xml new file mode 100644 index 0000000..da0cd9f --- /dev/null +++ b/tests/files/dcserialize/biedrzycki_akslop.xml @@ -0,0 +1,25 @@ + + + Biedrzycki, Miłosz + Akslop + Sekuła, Aleksandra + Sutkowska, Olga + Fundacja Nowoczesna Polska + Współczesność + Liryka + Wiersz + Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). + http://wolnelektury.pl/katalog/lektura/akslop + http://free.art.pl/mlb/gwiazdka.html#t1 + Miłosz Biedrzycki, * ("Gwiazdka"), Fundacja "brulion", Kraków-Warszawa, 1993 + Creative Commons Uznanie Autorstwa - Na Tych Samych Warunkach 3.0.PL + http://creativecommons.org/licenses/by-sa/3.0/ + xml + text + text + 2009-06-04 + L + pol + + \ No newline at end of file diff --git a/tests/files/dcserialize/kochanowski_piesn7.out b/tests/files/dcserialize/kochanowski_piesn7.out new file mode 100644 index 0000000..e69de29 diff --git a/tests/files/dcserialize/kochanowski_piesn7.xml b/tests/files/dcserialize/kochanowski_piesn7.xml new file mode 100644 index 0000000..b4d8d2e --- /dev/null +++ b/tests/files/dcserialize/kochanowski_piesn7.xml @@ -0,0 +1,27 @@ + + + Kochanowski, Jan + Pieśń VII (Trudna rada w tej mierze: przyjdzie się rozjechać...) + http://www.wolnelektury.pl/lektura/piesni-ksiegi-pierwsze + Sekuła, Aleksandra + Krzyżanowski, Julian + Otwinowska, Barbara + Gałecki, Dariusz + Fundacja Nowoczesna Polska + Renesans + Liryka + Pieśń + Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. + http://wolnelektury.pl/katalog/lektura/piesni-ksiegi-pierwsze-piesn-vii-trudna-rada-w-tej-mierze-pr + http://www.polona.pl/Content/1499 + Kochanowski, Jan (1530-1584), Dzieła polskie, tom 1, oprac. Julian Krzyżanowski, wyd. 8, Państwowy Instytut Wydawniczy, Warszawa, 1976 + Domena publiczna - Jan Kochanowski zm. 1584 + 1584 + xml + text + text + 2007-08-31 + L + pol + + diff --git a/tests/files/dcserialize/mickiewicz_rybka.out b/tests/files/dcserialize/mickiewicz_rybka.out new file mode 100644 index 0000000..e69de29 diff --git a/tests/files/dcserialize/mickiewicz_rybka.xml b/tests/files/dcserialize/mickiewicz_rybka.xml new file mode 100644 index 0000000..0796a5b --- /dev/null +++ b/tests/files/dcserialize/mickiewicz_rybka.xml @@ -0,0 +1,28 @@ + + + Mickiewicz, Adam + Rybka + http://www.wolnelektury.pl/lektura/ballady-i-romanse + Sekuła, Aleksandra + Kallenbach, Józef + Sutkowska, Olga + Fundacja Nowoczesna Polska + Romantyzm + Liryka + Ballada + Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. + http://wolnelektury.pl/katalog/lektura/ballady-i-romanse-rybka + http://www.polona.pl/Content/2222 + Mickiewicz, Adam (1798-1855), Poezje, tom 1 (Wiersze młodzieńcze - Ballady i romanse - Wiersze do r. 1824), Krakowska Spółdzielnia Wydawnicza, wyd. 2 zwiększone, Kraków, 1922 + Domena publiczna - Adam Mickiewicz zm. 1855 + 1855 + xml + text + text + 2007-09-06 + SP2 + G + L + pol + + \ No newline at end of file diff --git a/tests/files/dcserialize/sofokles_antygona.out b/tests/files/dcserialize/sofokles_antygona.out new file mode 100644 index 0000000..e69de29 diff --git a/tests/files/dcserialize/sofokles_antygona.xml b/tests/files/dcserialize/sofokles_antygona.xml new file mode 100644 index 0000000..4acb2d4 --- /dev/null +++ b/tests/files/dcserialize/sofokles_antygona.xml @@ -0,0 +1,25 @@ + + + Sofokles + Antygona + Sekuła, Aleksandra + Morawski, Kazimierz + Gałecki, Dariusz + Fundacja Nowoczesna Polska + Starożytność + Dramat + Tragedia + Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. + http://wolnelektury.pl/katalog/lektura/antygona + http://www.polona.pl/Content/3768 + Sofokles (496-406 a.C.), Antygona, Zakład Narodowy im. Ossolińskich, wyd. 7, Lwów, 1939 + Domena publiczna - tłumacz Kazimierz Morawski zm. 1925 + 1925 + xml + text + text + 2007-08-30 + G + pol + + \ No newline at end of file diff --git a/librarian/tests/files/erroneous/asnyk_miedzy_nami.html b/tests/files/erroneous/asnyk_miedzy_nami.html similarity index 100% rename from librarian/tests/files/erroneous/asnyk_miedzy_nami.html rename to tests/files/erroneous/asnyk_miedzy_nami.html diff --git a/librarian/tests/files/erroneous/asnyk_miedzy_nami.xml b/tests/files/erroneous/asnyk_miedzy_nami.xml similarity index 100% rename from librarian/tests/files/erroneous/asnyk_miedzy_nami.xml rename to tests/files/erroneous/asnyk_miedzy_nami.xml diff --git a/tests/files/text/asnyk_miedzy_nami.txt b/tests/files/text/asnyk_miedzy_nami.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/files/text/asnyk_miedzy_nami.xml b/tests/files/text/asnyk_miedzy_nami.xml new file mode 100644 index 0000000..5716a28 --- /dev/null +++ b/tests/files/text/asnyk_miedzy_nami.xml @@ -0,0 +1,25 @@ + + + Adam Asnyk + Między nami nic nie było + + Między nami nic nie było!/ + Żadnych zwierzeń, wyznań żadnych!/ + Nic nas z sobą nie łączyło ---/ + Prócz wiosennych marzeń zdradnych; + + Prócz tych woni, barw i blasków,/ + Unoszących się w przestrzeni;/ + Prócz szumiących śpiewem lasków/ + I tej świeżej łąk zieleni; + + Prócz tych kaskad i potoków,/ + Zraszających każdy parów,/ + Prócz girlandy tęcz, obłoków,/ + Prócz natury słodkich czarów; + + Prócz tych wspólnych, jasnych zdrojów,/ + Z których serce zachwyt piło;/ + Prócz pierwiosnków i powojów,---/ + Między nami nic nie było! + diff --git a/tests/test_dcparser.py b/tests/test_dcparser.py new file mode 100755 index 0000000..62e664c --- /dev/null +++ b/tests/test_dcparser.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import unittest + +from lxml import etree +from utils import get_file_path +from librarian import dcparser, html, ParseError +from utils import AutoTestMetaclass + +class TestDCParser(unittest.TestCase): + __metaclass__ = AutoTestMetaclass + + TEST_DIR = 'dcparser' + + def run_auto_test(self, in_data, out_data): + info = dcparser.BookInfo.from_string(in_data).to_dict() + should_be = eval(out_data) + for key in should_be: + self.assertEqual( info[key], should_be[key] ) + +class TestDCSerialize(unittest.TestCase): + __metaclass__ = AutoTestMetaclass + + TEST_DIR = 'dcserialize' + + def run_auto_test(self, in_data, out_data): + import lxml.etree + # first parse the input + info = dcparser.BookInfo.from_string(in_data) + + # serialize + serialized = lxml.etree.tostring(info.to_etree(), encoding=unicode).encode('utf-8') + + # then parse again + info_bis = dcparser.BookInfo.from_string(serialized) + + # check if they are the same + for key in vars(info): + self.assertEqual( getattr(info, key), getattr(info_bis, key)) + + for key in vars(info_bis): + self.assertEqual( getattr(info, key), getattr(info_bis, key)) + +class TestParserErrors(unittest.TestCase): + def test_error(self): + try: + html.transform(get_file_path('erroneous', 'asnyk_miedzy_nami.xml'), + get_file_path('erroneous', 'asnyk_miedzy_nami.html')) + self.fail() + except ParseError: + pass + #self.assertEqual(e.position, (25, 13)) + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_text.py b/tests/test_text.py new file mode 100755 index 0000000..00fd787 --- /dev/null +++ b/tests/test_text.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# encoding: utf-8 + +import unittest + +from utils import get_file_path +from librarian import dcparser +from librarian import text, NoDublinCore + + +class TestXML(unittest.TestCase): + def test_no_dublincore(self): + try: + text.transform(get_file_path('text', 'asnyk_miedzy_nami.xml'), + get_file_path('text', 'asnyk_miedzy_nami.txt')) + self.fail() + except NoDublinCore, e: + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..1870a07 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,62 @@ +from __future__ import with_statement + +import os +from distutils.core import Command +from unittest import TextTestRunner, TestLoader +from glob import glob +from os.path import dirname, join, realpath, splitext, basename, walk +from os import listdir +import codecs + +class AutoTestMetaclass(type): + + def __new__(cls, name, bases, class_dict): + test_dir = class_dict.pop('TEST_DIR') + path = realpath( join(dirname(__file__), 'files', test_dir) ) + + for file in listdir(path): + base, ext = splitext(file) + if ext != '.xml': + continue + + class_dict['test_'+base] = cls.make_test_runner(base, \ + join(path, base +'.xml'), join(path, base + '.out') ) + + return type.__new__(cls, name, bases, class_dict) + + @staticmethod + def make_test_runner(name, inputf, outputf): + def runner(self): + with open(inputf, 'rb') as ifd: + with codecs.open(outputf, 'rb', encoding='utf-8') as ofd: + self.run_auto_test(ifd.read(), ofd.read()) + return runner + + +def get_file_path(dir_name, file_name): + return realpath(join(dirname(__file__), 'files', dir_name, file_name)) + +class TestCommand(Command): + user_options = [] + + def initialize_options(self): + self._dir = os.getcwd() + + def finalize_options(self): + pass + + def run(self): + ''' + Finds all the tests modules in tests/, and runs them. + ''' + testfiles = [] + for t in glob(join(self._dir, 'tests', '*.py')): + module_name = splitext(basename(t))[0] + if module_name.startswith('test'): + testfiles.append('.'.join(['tests', module_name]) + ) + + tests = TestLoader().loadTestsFromNames(testfiles) + t = TextTestRunner(verbosity=2) + t.run(tests) + -- 2.20.1