From: Radek Czajka Date: Fri, 27 May 2022 13:35:06 +0000 (+0200) Subject: more meta X-Git-Tag: 2.4.3~1 X-Git-Url: https://git.mdrn.pl/librarian.git/commitdiff_plain/1f9c252b0da609f7dff02ab74b60f39f57dd2e3d?ds=sidebyside more meta --- diff --git a/src/librarian/__init__.py b/src/librarian/__init__.py index 95ea3fe..68afe74 100644 --- a/src/librarian/__init__.py +++ b/src/librarian/__init__.py @@ -86,46 +86,6 @@ PLMETNS = XMLNamespace("http://dl.psnc.pl/schemas/plmet/") WLNS = EmptyNamespace() -@six.python_2_unicode_compatible -class WLURI(object): - """Represents a WL URI. Extracts slug from it.""" - slug = None - - example = 'http://wolnelektury.pl/katalog/lektura/template/' - _re_wl_uri = re.compile( - r'http://(www\.)?wolnelektury.pl/katalog/lektur[ay]/' - '(?P[-a-z0-9]+)/?$' - ) - - def __init__(self, uri): - uri = six.text_type(uri) - self.uri = uri - self.slug = uri.rstrip('/').rsplit('/', 1)[-1] - - @classmethod - def strict(cls, uri): - match = cls._re_wl_uri.match(uri) - if not match: - raise ValidationError(u'Invalid URI (%s). Should match: %s' % ( - uri, cls._re_wl_uri.pattern)) - return cls(uri) - - @classmethod - def from_slug(cls, slug): - """Contructs an URI from slug. - - >>> print(WLURI.from_slug('a-slug').uri) - http://wolnelektury.pl/katalog/lektura/a-slug/ - - """ - uri = 'http://wolnelektury.pl/katalog/lektura/%s/' % slug - return cls(uri) - - def __str__(self): - return self.uri - - def __eq__(self, other): - return self.slug == other.slug class DocProvider(object): @@ -138,11 +98,6 @@ class DocProvider(object): """Should return a file-like object with a WL document XML.""" raise NotImplementedError - def by_uri(self, uri, wluri=WLURI): - """Should return a file-like object with a WL document XML.""" - wluri = wluri(uri) - return self.by_slug(wluri.slug) - class DirDocProvider(DocProvider): """ Serve docs from a directory of files in form .xml """ @@ -157,6 +112,7 @@ class DirDocProvider(DocProvider): from . import dcparser +from .meta.types.wluri import WLURI DEFAULT_BOOKINFO = dcparser.BookInfo( diff --git a/src/librarian/dcparser.py b/src/librarian/dcparser.py index fe4b3fd..f402c04 100644 --- a/src/librarian/dcparser.py +++ b/src/librarian/dcparser.py @@ -9,7 +9,7 @@ import six from librarian.util import roman_to_int from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS, - XMLNS, WLURI, WLNS, PLMETNS) + XMLNS, WLNS, PLMETNS) import lxml.etree as etree from lxml.etree import XMLSyntaxError @@ -17,16 +17,16 @@ from lxml.etree import XMLSyntaxError from librarian.meta.types.bool import BoolValue from librarian.meta.types.date import DateValue from librarian.meta.types.person import Person +from librarian.meta.types.wluri import WLURI from librarian.meta.types.text import TextValue class Field(object): - def __init__(self, uri, attr_name, validator=TextValue, strict=None, + def __init__(self, uri, attr_name, value_type=TextValue, multiple=False, salias=None, **kwargs): self.uri = uri self.name = attr_name - self.validator = validator - self.strict = strict + self.value_type = value_type self.multiple = multiple self.salias = salias @@ -35,24 +35,12 @@ class Field(object): self.default = kwargs.get('default', [] if multiple else [None]) def validate_value(self, val, strict=False): - if strict and self.strict is not None: - validator = self.strict - else: - validator = self.validator + #if strict: + # value.validate() + try: if self.multiple: - if validator is None: - return val - new_values = [] - for v in val: - nv = v - if v is not None: - #nv = validator(v) - nv = v - if hasattr(v, 'lang'): - setattr(nv, 'lang', v.lang) - new_values.append(nv) - return new_values + return val elif len(val) > 1: raise ValidationError( "Multiple values not allowed for field '%s'" % self.uri @@ -63,13 +51,7 @@ class Field(object): % self.uri ) else: - if validator is None or val[0] is None: - return val[0] - #nv = validator(val[0]) - nv = val[0] - if hasattr(val[0], 'lang') and not hasattr(validator, 'no_lang'): - setattr(nv, 'lang', val[0].lang) - return nv + return val[0] except ValueError as e: raise ValidationError( "Field '%s' - invald value: %s" @@ -219,25 +201,25 @@ class WorkInfo(six.with_metaclass(DCInfo, object)): p = p.getparent() for e in desc.getchildren(): - field = cls.get_field_by_uri(e.tag) + tag = e.tag + if tag == 'meta': + meta_id = e.attrib.get('id') + if meta_id and meta_id.endswith('-id'): + tag = meta_id + + field = cls.get_field_by_uri(tag) if field is None: # Ignore unknown fields. - ### TODO: does it do for isbn? continue - fv = field_dict.get(e.tag, []) + fv = field_dict.get(tag, []) if e.text is not None: - val = field.validator(e.text) + val = field.value_type.from_text(e.text) val.lang = e.attrib.get(XMLNS('lang'), lang) - - if e.tag == 'meta': - meta_id = e.attrib.get('id') - if meta_id and meta_id.endswith('-id'): - field_dict[meta_id] = [val.replace('ISBN-', 'ISBN ')] else: val = e.text fv.append(val) - field_dict[e.tag] = fv + field_dict[tag] = fv return cls(desc.attrib, field_dict, *args, **kwargs) @@ -255,6 +237,7 @@ class WorkInfo(six.with_metaclass(DCInfo, object)): for field in self.FIELDS: value = field.validate(dc_fields, fallbacks=fallbacks, strict=strict, validate_required=validate_required) + print(field.name, value) setattr(self, 'prop_' + field.name, value) self.fmap[field.name] = field if field.salias: diff --git a/src/librarian/meta/types/date.py b/src/librarian/meta/types/date.py index dcadef5..7094fc3 100644 --- a/src/librarian/meta/types/date.py +++ b/src/librarian/meta/types/date.py @@ -1,3 +1,6 @@ +from datetime import date +import re +import time from .base import MetaValue @@ -18,13 +21,10 @@ class DateValue(MetaValue): """ try: # check out the "N. poł X w." syntax - if isinstance(text, six.binary_type): - text = text.decode("utf-8") - century_format = ( - u"(?:([12]) *poł[.]? +)?([MCDXVI]+) *w[.,]*(?: *l[.]? *([0-9]+))?" + "(?:([12]) *poł[.]? +)?([MCDXVI]+) *w[.,]*(?: *l[.]? *([0-9]+))?" ) - vague_format = u"(?:po *|ok. *)?([0-9]{4})(-[0-9]{2}-[0-9]{2})?" + vague_format = "(?:po *|ok. *)?([0-9]{4})(-[0-9]{2}-[0-9]{2})?" m = re.match(century_format, text) m2 = re.match(vague_format, text) diff --git a/src/librarian/meta/types/wluri.py b/src/librarian/meta/types/wluri.py index c31d391..45bf23c 100644 --- a/src/librarian/meta/types/wluri.py +++ b/src/librarian/meta/types/wluri.py @@ -1,3 +1,4 @@ +import re from .base import MetaValue diff --git a/src/librarian/parser.py b/src/librarian/parser.py index 5e2fb08..bea67b1 100644 --- a/src/librarian/parser.py +++ b/src/librarian/parser.py @@ -181,7 +181,7 @@ class WLDocument(object): for part_uri in self.book_info.parts: try: yield self.from_file( - self.provider.by_uri(part_uri), provider=self.provider + self.provider.by_slug(part_uri.slug), provider=self.provider ) except Exception as e: if pass_part_errors: diff --git a/src/librarian/pdf.py b/src/librarian/pdf.py index 7b93997..b32395f 100644 --- a/src/librarian/pdf.py +++ b/src/librarian/pdf.py @@ -425,7 +425,7 @@ def load_including_children(wldoc=None, provider=None, uri=None): """ if uri and provider: - f = provider.by_uri(uri) + f = provider.by_slug(uri.slug) text = f.read().decode('utf-8') f.close() elif wldoc is not None: