Merge branch '__images'
author Marcin Koziej <marcin@koziej.info>
Tue, 5 Nov 2013 09:04:45 +0000 (10:04 +0100)
committer Marcin Koziej <marcin@koziej.info>
Tue, 5 Nov 2013 09:04:45 +0000 (10:04 +0100)
1  2 
librarian/dcparser.py

diff --combined librarian/dcparser.py
@@@ -6,6 -6,8 +6,8 @@@
  from xml.parsers.expat import ExpatError
  from datetime import date
  import time
+ import re
+ from librarian.util import roman_to_int
  
  from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS,
                         WLURI)
@@@ -62,10 -64,17 +64,17 @@@ class Person(object)
  
  def as_date(text):
      try:
-         try:
-             t = time.strptime(text, '%Y-%m-%d')
-         except ValueError:
-             t = time.strptime(text, '%Y')
+         # check out the "N. poł X w." syntax
+         m = re.match(u"([12]) *poł[.]? ([MCDXVI]+) .*[.]?", text)
+         if m:
+             half = int(m.groups()[0])
+             century = roman_to_int(str(m.groups()[1]))
+             t = ((century*100 + (half-1)*50), 1, 1)
+         else:
+             try:
+                 t = time.strptime(text, '%Y-%m-%d')
+             except ValueError:
+                 t = time.strptime(text, '%Y')
          return date(t[0], t[1], t[2])
      except ValueError, e:
          raise ValueError("Unrecognized date format. Try YYYY-MM-DD or YYYY.")
@@@ -79,63 -88,44 +88,63 @@@ def as_unicode(text)
      else:
          return text.decode('utf-8')
  
 +def as_wluri_strict(text):
 +    return WLURI.strict(text)
 +
  class Field(object):
 -    def __init__(self, uri, attr_name, type=as_unicode, multiple=False, salias=None, **kwargs):
 +    def __init__(self, uri, attr_name, validator=as_unicode, strict=None, multiple=False, salias=None, **kwargs):
          self.uri = uri
          self.name = attr_name
 -        self.validator = type
 +        self.validator = validator
 +        self.strict = strict
          self.multiple = multiple
          self.salias = salias
  
          self.required = kwargs.get('required', True) and not kwargs.has_key('default')
          self.default = kwargs.get('default', [] if multiple else [None])
  
 -    def validate_value(self, val):
 +    def validate_value(self, val, strict=False):
 +        if strict and self.strict is not None:
 +            validator = self.strict
 +        else:
 +            validator = self.validator
          try:
              if self.multiple:
 -                if self.validator is None:
 +                if validator is None:
                      return val
 -                return [ self.validator(v) if v is not None else v for v in val ]
 +                return [ validator(v) if v is not None else v for v in val ]
              elif len(val) > 1:
                  raise ValidationError("Multiple values not allowed for field '%s'" % self.uri)
              elif len(val) == 0:
                  raise ValidationError("Field %s has no value to assign. Check your defaults." % self.uri)
              else:
 -                if self.validator is None or val[0] is None:
 +                if validator is None or val[0] is None:
                      return val[0]
 -                return self.validator(val[0])
 +                return validator(val[0])
          except ValueError, e:
              raise ValidationError("Field '%s' - invald value: %s" % (self.uri, e.message))
  
 -    def validate(self, fdict):
 +    def validate(self, fdict, fallbacks=None, strict=False):
 +        if fallbacks is None:
 +            fallbacks = {}
          if not fdict.has_key(self.uri):
              if not self.required:
 -                f = self.default
 +                # Accept single value for single fields and saliases.
 +                if self.name in fallbacks:
 +                    if self.multiple:
 +                        f = fallbacks[self.name]
 +                    else:
 +                        f = [fallbacks[self.name]]
 +                elif self.salias and self.salias in fallbacks:
 +                    f = [fallbacks[self.salias]]
 +                else:
 +                    f = self.default
              else:
                  raise ValidationError("Required field %s not found" % self.uri)
          else:
              f = fdict[self.uri]
  
 -        return self.validate_value(f)
 +        return self.validate_value(f, strict=strict)
  
      def __eq__(self, other):
          if isinstance(other, Field) and other.name == self.name:
@@@ -171,9 -161,6 +180,9 @@@ class WorkInfo(object)
              as_person, salias='editor', multiple=True, default=[]),
          Field( DCNS('contributor.technical_editor'), 'technical_editors',
              as_person, salias='technical_editor', multiple=True, default=[]),
 +        Field( DCNS('contributor.funding'), 'funders',
 +            salias='funder', multiple=True, default=[]),
 +        Field( DCNS('contributor.thanks'), 'thanks', required=False),
  
          Field( DCNS('date'), 'created_at', as_date),
          Field( DCNS('date.pd'), 'released_to_public_domain_at', as_date, required=False),
  
          Field( DCNS('source'), 'source_name', required=False),
          Field( DCNS('source.URL'), 'source_url', required=False),
 -        Field( DCNS('identifier.url'), 'url', WLURI),
 +        Field( DCNS('identifier.url'), 'url', WLURI, strict=as_wluri_strict),
          Field( DCNS('rights.license'), 'license', required=False),
          Field( DCNS('rights'), 'license_description'),
      )
  
      @classmethod
 -    def from_string(cls, xml):
 +    def from_string(cls, xml, *args, **kwargs):
          from StringIO import StringIO
 -        return cls.from_file(StringIO(xml))
 +        return cls.from_file(StringIO(xml), *args, **kwargs)
  
      @classmethod
 -    def from_file(cls, xmlfile):
 +    def from_file(cls, xmlfile, *args, **kwargs):
          desc_tag = None
          try:
              iter = etree.iterparse(xmlfile, ['start', 'end'])
              # if there is no end, Expat should yell at us with an ExpatError
  
              # extract data from the element and make the info
 -            return cls.from_element(desc_tag)
 +            return cls.from_element(desc_tag, *args, **kwargs)
          except XMLSyntaxError, e:
              raise ParseError(e)
          except ExpatError, e:
              raise ParseError(e)
  
      @classmethod
 -    def from_element(cls, rdf_tag):
 +    def from_element(cls, rdf_tag, *args, **kwargs):
          # the tree is already parsed, so we don't need to worry about Expat errors
          field_dict = {}
          desc = rdf_tag.find(".//" + RDFNS('Description'))
              fv.append(e.text)
              field_dict[e.tag] = fv
  
 -        return cls(desc.attrib, field_dict)
 +        return cls(desc.attrib, field_dict, *args, **kwargs)
  
 -    def __init__(self, rdf_attrs, dc_fields):
 +    def __init__(self, rdf_attrs, dc_fields, fallbacks=None, strict=False):
          """rdf_attrs should be a dictionary-like object with any attributes of the RDF:Description.
          dc_fields - dictionary mapping DC fields (with namespace) to list of text values for the
          given field. """
          self.fmap = {}
  
          for field in self.FIELDS:
 -            value = field.validate( dc_fields )
 +            value = field.validate(dc_fields, fallbacks=fallbacks,
 +                            strict=strict)
              setattr(self, 'prop_' + field.name, value)
              self.fmap[field.name] = field
              if field.salias: self.fmap[field.salias] = field
  
 -        self.validate()
 -
 -    def validate(self):
 -        self.url.validate_language(self.language)
 -
      def __getattribute__(self, name):
          try:
              field = object.__getattribute__(self, 'fmap')[name]
                  if not field.multiple:
                      raise "OUCH!! for field %s" % name
  
 -                return value[0]
 +                return value[0] if value else None
          except (KeyError, AttributeError):
              return object.__getattribute__(self, name)
  
@@@ -362,23 -353,17 +371,23 @@@ class BookInfo(WorkInfo)
          Field( DCNS('audience'), 'audiences', salias='audience', multiple=True,
                  required=False),
  
 -        Field( DCNS('subject.period'), 'epochs', salias='epoch', multiple=True),
 -        Field( DCNS('subject.type'), 'kinds', salias='kind', multiple=True),
 -        Field( DCNS('subject.genre'), 'genres', salias='genre', multiple=True),
 +        Field( DCNS('subject.period'), 'epochs', salias='epoch', multiple=True,
 +                required=False),
 +        Field( DCNS('subject.type'), 'kinds', salias='kind', multiple=True,
 +                required=False),
 +        Field( DCNS('subject.genre'), 'genres', salias='genre', multiple=True,
 +                required=False),
                  
          Field( DCNS('contributor.translator'), 'translators', \
              as_person,  salias='translator', multiple=True, default=[]),
 -        Field( DCNS('relation.hasPart'), 'parts', WLURI, multiple=True, required=False),
 -
 -        Field( DCNS('relation.cover_image.url'), 'cover_url', required=False),
 -        Field( DCNS('relation.cover_image.attribution'), 'cover_by', required=False),
 -        Field( DCNS('relation.cover_image.source'), 'cover_source', required=False),
 +        Field( DCNS('relation.hasPart'), 'parts', 
 +            WLURI, strict=as_wluri_strict, multiple=True, required=False),
 +        Field( DCNS('relation.isVariantOf'), 'variant_of', 
 +            WLURI, strict=as_wluri_strict, required=False),
 +
 +        Field( DCNS('relation.coverImage.url'), 'cover_url', required=False),
 +        Field( DCNS('relation.coverImage.attribution'), 'cover_by', required=False),
 +        Field( DCNS('relation.coverImage.source'), 'cover_source', required=False),
      )