X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/121ab507bfffe249087df81a6dcdc368d9cb11f5..88f39c4f90a67164b2c3da7c7a41df420f3b17ab:/src/librarian/dcparser.py diff --git a/src/librarian/dcparser.py b/src/librarian/dcparser.py index fe4b3fd..910f5e1 100644 --- a/src/librarian/dcparser.py +++ b/src/librarian/dcparser.py @@ -1,32 +1,31 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. -# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# Copyright © Fundacja Wolne Lektury. See NOTICE for more information. # from xml.parsers.expat import ExpatError from datetime import date +import io import time import re -import six from librarian.util import roman_to_int from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS, - XMLNS, WLURI, WLNS, PLMETNS) + XMLNS, WLNS, PLMETNS) import lxml.etree as etree from lxml.etree import XMLSyntaxError from librarian.meta.types.bool import BoolValue -from librarian.meta.types.date import DateValue from librarian.meta.types.person import Person -from librarian.meta.types.text import TextValue +from librarian.meta.types.wluri import WLURI +from librarian.meta.types import text -class Field(object): - def __init__(self, uri, attr_name, validator=TextValue, strict=None, +class Field: + def __init__(self, uri, attr_name, value_type=text.TextValue, multiple=False, salias=None, **kwargs): self.uri = uri self.name = attr_name - self.validator = validator - self.strict = strict + self.value_type = value_type self.multiple = multiple self.salias = salias @@ -35,24 +34,12 @@ class Field(object): self.default = kwargs.get('default', [] if multiple else [None]) def validate_value(self, val, strict=False): - if strict and self.strict is not None: - validator = self.strict - else: - validator = self.validator + #if strict: + # value.validate() + try: if self.multiple: - if validator is None: - return val - new_values = [] - for v in val: - nv = v - if v is not None: - #nv = validator(v) - nv = v - if hasattr(v, 'lang'): - setattr(nv, 'lang', v.lang) - new_values.append(nv) - return new_values + return val elif len(val) > 1: raise ValidationError( "Multiple values not allowed for field '%s'" % self.uri @@ -63,13 +50,7 @@ class Field(object): % self.uri ) else: - if validator is None or val[0] is None: - return val[0] - #nv = validator(val[0]) - nv = val[0] - if hasattr(val[0], 'lang') and not hasattr(validator, 'no_lang'): - setattr(nv, 'lang', val[0].lang) - return nv + return val[0] except ValueError as e: raise ValidationError( "Field '%s' - invald value: %s" @@ -122,7 +103,7 @@ class DCInfo(type): return super(DCInfo, mcs).__new__(mcs, classname, bases, class_dict) -class WorkInfo(six.with_metaclass(DCInfo, object)): +class WorkInfo(metaclass=DCInfo): FIELDS = ( Field(DCNS('creator'), 'authors', Person, salias='author', multiple=True), @@ -139,7 +120,7 @@ class WorkInfo(six.with_metaclass(DCInfo, object)): Field(DCNS('contributor.thanks'), 'thanks', required=False), Field(DCNS('date'), 'created_at'), - Field(DCNS('date.pd'), 'released_to_public_domain_at', DateValue, + Field(DCNS('date.pd'), 'released_to_public_domain_at', required=False), Field(DCNS('publisher'), 'publisher', multiple=True), @@ -169,7 +150,7 @@ class WorkInfo(six.with_metaclass(DCInfo, object)): @classmethod def from_bytes(cls, xml, *args, **kwargs): - return cls.from_file(six.BytesIO(xml), *args, **kwargs) + return cls.from_file(io.BytesIO(xml), *args, **kwargs) @classmethod def from_file(cls, xmlfile, *args, **kwargs): @@ -219,25 +200,25 @@ class WorkInfo(six.with_metaclass(DCInfo, object)): p = p.getparent() for e in desc.getchildren(): - field = cls.get_field_by_uri(e.tag) + tag = e.tag + if tag == 'meta': + meta_id = e.attrib.get('id') + if meta_id and meta_id.endswith('-id'): + tag = meta_id + + field = cls.get_field_by_uri(tag) if field is None: # Ignore unknown fields. - ### TODO: does it do for isbn? continue - fv = field_dict.get(e.tag, []) + fv = field_dict.get(tag, []) if e.text is not None: - val = field.validator(e.text) + val = field.value_type.from_text(e.text) val.lang = e.attrib.get(XMLNS('lang'), lang) - - if e.tag == 'meta': - meta_id = e.attrib.get('id') - if meta_id and meta_id.endswith('-id'): - field_dict[meta_id] = [val.replace('ISBN-', 'ISBN ')] else: val = e.text fv.append(val) - field_dict[e.tag] = fv + field_dict[tag] = fv return cls(desc.attrib, field_dict, *args, **kwargs) @@ -320,11 +301,11 @@ class WorkInfo(six.with_metaclass(DCInfo, object)): for x in v: e = etree.Element(field.uri) if x is not None: - e.text = six.text_type(x) + e.text = str(x) description.append(e) else: e = etree.Element(field.uri) - e.text = six.text_type(v) + e.text = str(v) description.append(e) return root @@ -339,9 +320,9 @@ class WorkInfo(six.with_metaclass(DCInfo, object)): if field.multiple: if len(v) == 0: continue - v = [six.text_type(x) for x in v if x is not None] + v = [str(x) for x in v if x is not None] else: - v = six.text_type(v) + v = str(v) dc[field.name] = {'uri': field.uri, 'value': v} rdf['fields'] = dc @@ -356,32 +337,33 @@ class WorkInfo(six.with_metaclass(DCInfo, object)): if field.multiple: if len(v) == 0: continue - v = [six.text_type(x) for x in v if x is not None] + v = [str(x) for x in v if x is not None] else: - v = six.text_type(v) + v = str(v) result[field.name] = v if field.salias: v = getattr(self, field.salias) if v is not None: - result[field.salias] = six.text_type(v) + result[field.salias] = str(v) return result class BookInfo(WorkInfo): FIELDS = ( - Field(DCNS('audience'), 'audiences', salias='audience', multiple=True, + Field(DCNS('audience'), 'audiences', text.Audience, salias='audience', multiple=True, required=False), - Field(DCNS('subject.period'), 'epochs', salias='epoch', multiple=True, + Field(DCNS('subject.period'), 'epochs', text.Epoch, salias='epoch', multiple=True, required=False), - Field(DCNS('subject.type'), 'kinds', salias='kind', multiple=True, + Field(DCNS('subject.type'), 'kinds', text.Kind, salias='kind', multiple=True, required=False), - Field(DCNS('subject.genre'), 'genres', salias='genre', multiple=True, + Field(DCNS('subject.genre'), 'genres', text.Genre, salias='genre', multiple=True, required=False), - Field(WLNS('category.legimi'), 'legimi', required=False), - + Field('category.legimi', 'legimi', text.LegimiCategory, required=False), + Field('category.thema.main', 'thema_main', text.MainThemaCategory, required=False), + Field('category.thema', 'thema', text.ThemaCategory, required=False, multiple=True), Field(DCNS('subject.location'), 'location', required=False), Field(DCNS('contributor.translator'), 'translators',