X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/5f2702eda7a1b36f4d29658b5468b6b78745218c..5b1dcc7d247996752fa566c7150a45037b068565:/librarian/dcparser.py diff --git a/librarian/dcparser.py b/librarian/dcparser.py index 5a571ec..7418f70 100644 --- a/librarian/dcparser.py +++ b/librarian/dcparser.py @@ -6,6 +6,8 @@ from xml.parsers.expat import ExpatError from datetime import date import time +import re +from librarian.util import roman_to_int from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS, WLURI) @@ -62,10 +64,24 @@ class Person(object): def as_date(text): try: - try: - t = time.strptime(text, '%Y-%m-%d') - except ValueError: - t = time.strptime(text, '%Y') + # check out the "N. poł X w." syntax + if isinstance(text, str): text = text.decode("utf-8") + m = re.match(u"(?:([12]) *poł[.]? )?([MCDXVI]+) *w[.]?", text) + if m: + + half = m.groups()[0] + if half is not None: + half = int(half) + else: + half = 1 + century = roman_to_int(str(m.groups()[1])) + t = ((century*100 + (half-1)*50), 1, 1) + else: + text = re.sub(r"(po|ok[.]?) *", "", text) + try: + t = time.strptime(text, '%Y-%m-%d') + except ValueError: + t = time.strptime(re.split(r'[-/]', text)[0], '%Y') return date(t[0], t[1], t[2]) except ValueError, e: raise ValueError("Unrecognized date format. Try YYYY-MM-DD or YYYY.") @@ -115,10 +131,21 @@ class Field(object): except ValueError, e: raise ValidationError("Field '%s' - invald value: %s" % (self.uri, e.message)) - def validate(self, fdict, strict=False): + def validate(self, fdict, fallbacks=None, strict=False): + if fallbacks is None: + fallbacks = {} if not fdict.has_key(self.uri): if not self.required: - f = self.default + # Accept single value for single fields and saliases. + if self.name in fallbacks: + if self.multiple: + f = fallbacks[self.name] + else: + f = [fallbacks[self.name]] + elif self.salias and self.salias in fallbacks: + f = [fallbacks[self.salias]] + else: + f = self.default else: raise ValidationError("Required field %s not found" % self.uri) else: @@ -160,6 +187,9 @@ class WorkInfo(object): as_person, salias='editor', multiple=True, default=[]), Field( DCNS('contributor.technical_editor'), 'technical_editors', as_person, salias='technical_editor', multiple=True, default=[]), + Field( DCNS('contributor.funding'), 'funders', + salias='funder', multiple=True, default=[]), + Field( DCNS('contributor.thanks'), 'thanks', required=False), Field( DCNS('date'), 'created_at', as_date), Field( DCNS('date.pd'), 'released_to_public_domain_at', as_date, required=False), @@ -224,7 +254,7 @@ class WorkInfo(object): return cls(desc.attrib, field_dict, *args, **kwargs) - def __init__(self, rdf_attrs, dc_fields, strict=False): + def __init__(self, rdf_attrs, dc_fields, fallbacks=None, strict=False): """rdf_attrs should be a dictionary-like object with any attributes of the RDF:Description. dc_fields - dictionary mapping DC fields (with namespace) to list of text values for the given field. """ @@ -233,7 +263,8 @@ class WorkInfo(object): self.fmap = {} for field in self.FIELDS: - value = field.validate(dc_fields, strict=strict) + value = field.validate(dc_fields, fallbacks=fallbacks, + strict=strict) setattr(self, 'prop_' + field.name, value) self.fmap[field.name] = field if field.salias: self.fmap[field.salias] = field