fixes for dates etc
authorMarcin Koziej <marcin@lolownia.org>
Wed, 4 Dec 2013 18:45:25 +0000 (19:45 +0100)
committerMarcin Koziej <marcin@lolownia.org>
Wed, 4 Dec 2013 18:45:25 +0000 (19:45 +0100)
librarian/dcparser.py
librarian/picture.py

index 079c434..7418f70 100644 (file)
@@ -66,16 +66,22 @@ def as_date(text):
     try:
         # check out the "N. poł X w." syntax
         if isinstance(text, str): text = text.decode("utf-8")
     try:
         # check out the "N. poł X w." syntax
         if isinstance(text, str): text = text.decode("utf-8")
-        m = re.match(u"([12]) *poł[.]? ([MCDXVI]+) *w[.]?", text)
+        m = re.match(u"(?:([12]) *poł[.]? )?([MCDXVI]+) *w[.]?", text)
         if m:
         if m:
-            half = int(m.groups()[0])
+            
+            half = m.groups()[0]
+            if half is not None: 
+                half = int(half)
+            else: 
+                half = 1
             century = roman_to_int(str(m.groups()[1]))
             t = ((century*100 + (half-1)*50), 1, 1)
         else:
             century = roman_to_int(str(m.groups()[1]))
             t = ((century*100 + (half-1)*50), 1, 1)
         else:
+            text = re.sub(r"(po|ok[.]?) *", "", text)
             try:
                 t = time.strptime(text, '%Y-%m-%d')
             except ValueError:
             try:
                 t = time.strptime(text, '%Y-%m-%d')
             except ValueError:
-                t = time.strptime(text, '%Y')
+                t = time.strptime(re.split(r'[-/]', text)[0], '%Y')
         return date(t[0], t[1], t[2])
     except ValueError, e:
         raise ValueError("Unrecognized date format. Try YYYY-MM-DD or YYYY.")
         return date(t[0], t[1], t[2])
     except ValueError, e:
         raise ValueError("Unrecognized date format. Try YYYY-MM-DD or YYYY.")
index 32ffe09..7b98ff1 100644 (file)
@@ -5,9 +5,10 @@ from xml.parsers.expat import ExpatError
 from os import path
 from StringIO import StringIO
 from lxml import etree
 from os import path
 from StringIO import StringIO
 from lxml import etree
-from lxml.etree import (XMLSyntaxError, XSLTApplyError)
+from lxml.etree import (XMLSyntaxError, XSLTApplyError, Element)
 import re
 import re
-
+from functools import *
+from operator import *
 
 class WLPictureURI(WLURI):
     _re_wl_uri = re.compile('http://wolnelektury.pl/katalog/obraz/'
 
 class WLPictureURI(WLURI):
     _re_wl_uri = re.compile('http://wolnelektury.pl/katalog/obraz/'
@@ -125,7 +126,9 @@ class WLPicture(object):
             parser = etree.XMLParser(remove_blank_text=False)
             tree = etree.parse(StringIO(data.encode('utf-8')), parser)
 
             parser = etree.XMLParser(remove_blank_text=False)
             tree = etree.parse(StringIO(data.encode('utf-8')), parser)
 
-            return cls(tree, parse_dublincore=parse_dublincore, image_store=image_store)
+            me =  cls(tree, parse_dublincore=parse_dublincore, image_store=image_store)
+            me.load_frame_info()
+            return me
         except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
             raise ParseError(e)
 
         except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
             raise ParseError(e)
 
@@ -143,32 +146,62 @@ class WLPicture(object):
     def image_path(self):
         if self.image_store is None:
             raise ValueError("No image store associated with whis WLPicture.")
     def image_path(self):
         if self.image_store is None:
             raise ValueError("No image store associated with whis WLPicture.")
+
         return self.image_store.path(self.slug, self.mime_type)
 
     def image_file(self, *args, **kwargs):
         return open(self.image_path, *args, **kwargs)
 
         return self.image_store.path(self.slug, self.mime_type)
 
     def image_file(self, *args, **kwargs):
         return open(self.image_path, *args, **kwargs)
 
+    def get_sem_coords(self, sem):
+        area = sem.find("div[@type='rect']")
+        if area is None:
+            area = sem.find("div[@type='whole']")
+            return ((0, 0), (-1, -1))
+
+        def has_all_props(node, props):
+            return reduce(and_, map(lambda prop: prop in node.attrib, props))
+
+        if has_all_props(area,  ['x1', 'x2', 'y1', 'y2']) == False:
+            return None
+            
+        def n(prop): return int(area.get(prop))
+        return ((n('x1'), n('y1')), (n('x2'), n('y2')))
+        
+
     def partiter(self):
         """
         Iterates the parts of this picture and returns them and their metadata
         """
     def partiter(self):
         """
         Iterates the parts of this picture and returns them and their metadata
         """
-        for part in self.edoc.iter("div"):
+        # omg no support for //sem[(@type='theme') or (@type='object')] ?
+        for part in list(self.edoc.iterfind("//sem[@type='theme']")) + list(self.edoc.iterfind("//sem[@type='object']")):
             pd = {}
             pd['type'] = part.get('type')
             pd = {}
             pd['type'] = part.get('type')
-            if pd['type'] == 'area':
-                pd['coords'] = ((int(part.get('x1')), int(part.get('y1'))),
-                                (int(part.get('x2')), int(part.get('y2'))))
-
-            pd['themes'] = []
-            pd['object'] = None
-            parent = part
-            while True:
-                parent = parent.getparent()
-                if parent is None:
-                    break
-                if parent.tag == 'sem':
-                    if parent.get('type') == 'theme':
-                        pd['themes'] += map(unicode.strip, unicode(parent.get('theme')).split(','))
-                    elif parent.get('type') == 'object' and pd['object'] is None:
-                        pd['object'] = parent.get('object')
+
+            coords = self.get_sem_coords(part)
+            if coords is None: continue
+            pd['coords'] = coords
+
+            def want_unicode(x):
+                if not isinstance(x, unicode): return x.decode('utf-8')
+                else: return x
+            pd['object'] = part.attrib['type'] == 'object' and want_unicode(part.attrib.get('object', u'')) or None
+            pd['themes'] = part.attrib['type'] == 'theme' and [part.attrib.get('theme', u'')] or []
             yield pd
             yield pd
+
+    def load_frame_info(self):
+        k = self.edoc.find("//sem[@object='kadr']")
+        
+        if k is not None:
+            clip = self.get_sem_coords(k)
+            self.frame = clip
+            frm = Element("sem", {"type": "frame"})
+            frm.append(k.iter("div").next())
+            self.edoc.getroot().append(frm)
+            k.getparent().remove(k)
+        else:
+            frm = self.edoc.find("//sem[@type='frame']")
+            if frm:
+                self.frame = self.get_sem_coords(frm)
+            else:
+                self.frame = None
+        return self