merging master

author Marcin Koziej <marcin@lolownia.org>

Sat, 7 Dec 2013 17:17:06 +0000 (18:17 +0100)

committer Marcin Koziej <marcin@lolownia.org>

Sat, 7 Dec 2013 17:17:06 +0000 (18:17 +0100)
author Marcin Koziej <marcin@lolownia.org>
Sat, 7 Dec 2013 17:17:06 +0000 (18:17 +0100)
committer Marcin Koziej <marcin@lolownia.org>
Sat, 7 Dec 2013 17:17:06 +0000 (18:17 +0100)
diff --combined librarian/dcparser.py

index 7418f70,a690e8f..a7215a1
--- 1/librarian/dcparser.py
--- 2/librarian/dcparser.py
+++ b/librarian/dcparser.py
@@@ -10,12 -10,19 +10,19 @@@ import r
   from librarian.util import roman_to_int
   
   from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS,
-                        WLURI)
+                        XMLNS, WLURI)
   
   import lxml.etree as etree # ElementTree API using libxml2
   from lxml.etree import XMLSyntaxError
   
   
+ class TextPlus(unicode):
+     pass
+ 
+ class DatePlus(date):
+     pass
+ 
+ 
   # ==============
   # = Converters =
   # ==============
@@@ -63,26 -70,46 +70,47 @@@ class Person(object)
           return 'Person(last_name=%r, first_names=*%r)' % (self.last_name, self.first_names)
   
   def as_date(text):
+     """Dates for digitization of pictures. It seems we need the following:
+ ranges:               '1350-1450',
+ centuries:    "XVIII w.'
+ half centuries/decades: '2 poł. XVIII w.', 'XVII w., l. 20'
+ later-then: 'po 1450'
+ circa 'ok. 1813-1814', 'ok.1876-ok.1886
+ turn: 1893/1894
+ for now we will translate this to some single date losing information of course.
+     """
       try:
           # check out the "N. poł X w." syntax
           if isinstance(text, str): text = text.decode("utf-8")
-         m = re.match(u"(?:([12]) *poł[.]? )?([MCDXVI]+) *w[.]?", text)
++
+         century_format = u"(?:([12]) *poł[.]? +)?([MCDXVI]+) *w[.,]*(?: *l[.]? *([0-9]+))?"
+         vague_format = u"(?:po *|ok. *)?([0-9]{4})(-[0-9]{2}-[0-9]{2})?"
+ 
+         m = re.match(century_format, text)
+         m2 = re.match(vague_format, text)
           if m:
-             
-             half = m.groups()[0]
-             if half is not None: 
+             half = m.group(1)
+             decade = m.group(3)
+             century = roman_to_int(str(m.group(2)))
+             if half is not None:
+                 if decade is not None:
+                     raise ValueError("Bad date format. Cannot specify both half and decade of century")
                   half = int(half)
-             else: 
-                 half = 1
-             century = roman_to_int(str(m.groups()[1]))
-             t = ((century*100 + (half-1)*50), 1, 1)
+                 t = ((century*100 + (half-1)*50), 1, 1)
+             else:
+                 decade = int(decade or 0)
+                 t = ((century*100 + decade), 1, 1)
+         elif m2:
+             year = m2.group(1)
+             mon_day = m2.group(2)
+             if mon_day:
+                 t = time.strptime(year + mon_day, "%Y-%m-%d")
+             else:
+                 t = time.strptime(year, '%Y')
           else:
-             text = re.sub(r"(po|ok[.]?) *", "", text)
-             try:
-                 t = time.strptime(text, '%Y-%m-%d')
-             except ValueError:
-                 t = time.strptime(re.split(r'[-/]', text)[0], '%Y')
-         return date(t[0], t[1], t[2])
+             raise ValueError
+ 
+         return DatePlus(t[0], t[1], t[2])
       except ValueError, e:
           raise ValueError("Unrecognized date format. Try YYYY-MM-DD or YYYY.")
   
@@@ -93,7 -120,7 +121,7 @@@ def as_unicode(text)
       if isinstance(text, unicode):
           return text
       else:
-         return text.decode('utf-8')
+         return TextPlus(text.decode('utf-8'))
   
   def as_wluri_strict(text):
       return WLURI.strict(text)
@@@ -119,7 -146,15 +147,15 @@@ class Field(object)
               if self.multiple:
                   if validator is None:
                       return val
-                 return [ validator(v) if v is not None else v for v in val ]
+                 new_values = []
+                 for v in val:
+                     nv = v
+                     if v is not None:
+                         nv = validator(v)
+                         if hasattr(v, 'lang'):
+                             setattr(nv, 'lang', v.lang)
+                     new_values.append(nv)
+                 return new_values
               elif len(val) > 1:
                   raise ValidationError("Multiple values not allowed for field '%s'" % self.uri)
               elif len(val) == 0:
@@@ -127,7 -162,10 +163,10 @@@
               else:
                   if validator is None or val[0] is None:
                       return val[0]
-                 return validator(val[0])
+                 nv = validator(val[0])
+                 if hasattr(val[0], 'lang'):
+                     setattr(nv, 'lang', val[0].lang)
+                 return nv
           except ValueError, e:
               raise ValidationError("Field '%s' - invald value: %s" % (self.uri, e.message))
   
@@@ -247,9 -285,23 +286,23 @@@ class WorkInfo(object)
           if desc is None:
               raise NoDublinCore("No DublinCore section found.")
   
+         lang = None
+         p = desc
+         while p is not None and lang is None:
+             lang = p.attrib.get(XMLNS('lang'))
+             p = p.getparent()
+ 
           for e in desc.getchildren():
               fv = field_dict.get(e.tag, [])
-             fv.append(e.text)
+             if e.text is not None:
+                 text = e.text
+                 if not isinstance(text, unicode):
+                     text = text.decode('utf-8')
+                 val = TextPlus(text)
+                 val.lang = e.attrib.get(XMLNS('lang'), lang)
+             else:
+                 val = e.text
+             fv.append(val)
               field_dict[e.tag] = fv
   
           return cls(desc.attrib, field_dict, *args, **kwargs)
author	Marcin Koziej <marcin@lolownia.org>
	Sat, 7 Dec 2013 17:17:06 +0000 (18:17 +0100)
committer	Marcin Koziej <marcin@lolownia.org>
	Sat, 7 Dec 2013 17:17:06 +0000 (18:17 +0100)