Basic biblical tools.
[librarian.git] / src / librarian / dcparser.py
index fe4b3fd..ce03be2 100644 (file)
@@ -1,32 +1,31 @@
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+# Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
 #
 from xml.parsers.expat import ExpatError
 from datetime import date
 #
 from xml.parsers.expat import ExpatError
 from datetime import date
+import io
 import time
 import re
 import time
 import re
-import six
 from librarian.util import roman_to_int
 
 from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS,
 from librarian.util import roman_to_int
 
 from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS,
-                       XMLNS, WLURI, WLNS, PLMETNS)
+                       XMLNS, WLNS, PLMETNS)
 
 import lxml.etree as etree
 from lxml.etree import XMLSyntaxError
 
 from librarian.meta.types.bool import BoolValue
 
 import lxml.etree as etree
 from lxml.etree import XMLSyntaxError
 
 from librarian.meta.types.bool import BoolValue
-from librarian.meta.types.date import DateValue
 from librarian.meta.types.person import Person
 from librarian.meta.types.person import Person
-from librarian.meta.types.text import TextValue
+from librarian.meta.types.wluri import WLURI
+from librarian.meta.types import text
 
 
 
 
-class Field(object):
-    def __init__(self, uri, attr_name, validator=TextValue, strict=None,
+class Field:
+    def __init__(self, uri, attr_name, value_type=text.TextValue,
                  multiple=False, salias=None, **kwargs):
         self.uri = uri
         self.name = attr_name
                  multiple=False, salias=None, **kwargs):
         self.uri = uri
         self.name = attr_name
-        self.validator = validator
-        self.strict = strict
+        self.value_type = value_type
         self.multiple = multiple
         self.salias = salias
 
         self.multiple = multiple
         self.salias = salias
 
@@ -35,24 +34,12 @@ class Field(object):
         self.default = kwargs.get('default', [] if multiple else [None])
 
     def validate_value(self, val, strict=False):
         self.default = kwargs.get('default', [] if multiple else [None])
 
     def validate_value(self, val, strict=False):
-        if strict and self.strict is not None:
-            validator = self.strict
-        else:
-            validator = self.validator
+        #if strict:
+        #    value.validate()
+
         try:
             if self.multiple:
         try:
             if self.multiple:
-                if validator is None:
-                    return val
-                new_values = []
-                for v in val:
-                    nv = v
-                    if v is not None:
-                        #nv = validator(v)
-                        nv = v
-                        if hasattr(v, 'lang'):
-                            setattr(nv, 'lang', v.lang)
-                    new_values.append(nv)
-                return new_values
+                return val
             elif len(val) > 1:
                 raise ValidationError(
                     "Multiple values not allowed for field '%s'" % self.uri
             elif len(val) > 1:
                 raise ValidationError(
                     "Multiple values not allowed for field '%s'" % self.uri
@@ -63,13 +50,7 @@ class Field(object):
                     % self.uri
                 )
             else:
                     % self.uri
                 )
             else:
-                if validator is None or val[0] is None:
-                    return val[0]
-                #nv = validator(val[0])
-                nv = val[0]
-                if hasattr(val[0], 'lang') and not hasattr(validator, 'no_lang'):
-                    setattr(nv, 'lang', val[0].lang)
-                return nv
+                return val[0]
         except ValueError as e:
             raise ValidationError(
                 "Field '%s' - invald value: %s"
         except ValueError as e:
             raise ValidationError(
                 "Field '%s' - invald value: %s"
@@ -106,23 +87,7 @@ class Field(object):
         return False
 
 
         return False
 
 
-class DCInfo(type):
-    def __new__(mcs, classname, bases, class_dict):
-        fields = list(class_dict['FIELDS'])
-
-        for base in bases[::-1]:
-            if hasattr(base, 'FIELDS'):
-                for field in base.FIELDS[::-1]:
-                    try:
-                        fields.index(field)
-                    except ValueError:
-                        fields.insert(0, field)
-
-        class_dict['FIELDS'] = tuple(fields)
-        return super(DCInfo, mcs).__new__(mcs, classname, bases, class_dict)
-
-
-class WorkInfo(six.with_metaclass(DCInfo, object)):
+class BookInfo:
     FIELDS = (
         Field(DCNS('creator'), 'authors', Person, salias='author',
               multiple=True),
     FIELDS = (
         Field(DCNS('creator'), 'authors', Person, salias='author',
               multiple=True),
@@ -139,7 +104,7 @@ class WorkInfo(six.with_metaclass(DCInfo, object)):
         Field(DCNS('contributor.thanks'), 'thanks', required=False),
 
         Field(DCNS('date'), 'created_at'),
         Field(DCNS('contributor.thanks'), 'thanks', required=False),
 
         Field(DCNS('date'), 'created_at'),
-        Field(DCNS('date.pd'), 'released_to_public_domain_at', DateValue,
+        Field(DCNS('date.pd'), 'released_to_public_domain_at',
               required=False),
         Field(DCNS('publisher'), 'publisher', multiple=True),
 
               required=False),
         Field(DCNS('publisher'), 'publisher', multiple=True),
 
@@ -159,6 +124,48 @@ class WorkInfo(six.with_metaclass(DCInfo, object)):
         Field(WLNS('contentWarning'), 'content_warnings', multiple=True,
               required=False),
         Field(WLNS('developmentStage'), 'stage', required=False),
         Field(WLNS('contentWarning'), 'content_warnings', multiple=True,
               required=False),
         Field(WLNS('developmentStage'), 'stage', required=False),
+
+        Field(DCNS('audience'), 'audiences', text.Audience, salias='audience', multiple=True,
+              required=False),
+
+        Field(DCNS('subject.period'), 'epochs', text.Epoch, salias='epoch', multiple=True,
+              required=False),
+        Field(DCNS('subject.type'), 'kinds', text.Kind, salias='kind', multiple=True,
+              required=False),
+        Field(DCNS('subject.genre'), 'genres', text.Genre, salias='genre', multiple=True,
+              required=False),
+        Field('category.legimi', 'legimi', text.LegimiCategory, required=False),
+        Field('category.thema.main', 'thema_main', text.MainThemaCategory, required=False),
+        Field('category.thema', 'thema', text.ThemaCategory, required=False, multiple=True),
+        Field(DCNS('subject.location'), 'location', required=False),
+
+        Field(DCNS('contributor.translator'), 'translators',
+              Person,  salias='translator', multiple=True, required=False),
+        Field(DCNS('relation.hasPart'), 'parts', WLURI,
+              multiple=True, required=False),
+        Field(DCNS('relation.isVariantOf'), 'variant_of', WLURI,
+              required=False),
+
+        Field(DCNS('relation.coverImage.url'), 'cover_url', required=False),
+        Field(DCNS('relation.coverImage.attribution'), 'cover_by',
+              required=False),
+        Field(DCNS('relation.coverImage.source'), 'cover_source',
+              required=False),
+        # WLCover-specific.
+        Field(WLNS('coverBarColor'), 'cover_bar_color', required=False),
+        Field(WLNS('coverBoxPosition'), 'cover_box_position', required=False),
+        Field(WLNS('coverClass'), 'cover_class', default=['default']),
+        Field(WLNS('coverLogoUrl'), 'cover_logo_urls', multiple=True,
+              required=False),
+        Field(WLNS('endnotes'), 'endnotes', BoolValue,
+              required=False),
+
+        Field('pdf-id',  'isbn_pdf',  required=False),
+        Field('epub-id', 'isbn_epub', required=False),
+        Field('mobi-id', 'isbn_mobi', required=False),
+        Field('txt-id',  'isbn_txt',  required=False),
+        Field('html-id', 'isbn_html', required=False),
+
     )
 
     @classmethod
     )
 
     @classmethod
@@ -169,7 +176,7 @@ class WorkInfo(six.with_metaclass(DCInfo, object)):
     
     @classmethod
     def from_bytes(cls, xml, *args, **kwargs):
     
     @classmethod
     def from_bytes(cls, xml, *args, **kwargs):
-        return cls.from_file(six.BytesIO(xml), *args, **kwargs)
+        return cls.from_file(io.BytesIO(xml), *args, **kwargs)
 
     @classmethod
     def from_file(cls, xmlfile, *args, **kwargs):
 
     @classmethod
     def from_file(cls, xmlfile, *args, **kwargs):
@@ -219,25 +226,25 @@ class WorkInfo(six.with_metaclass(DCInfo, object)):
             p = p.getparent()
 
         for e in desc.getchildren():
             p = p.getparent()
 
         for e in desc.getchildren():
-            field = cls.get_field_by_uri(e.tag)
+            tag = e.tag
+            if tag == 'meta':
+                meta_id = e.attrib.get('id')
+                if meta_id and meta_id.endswith('-id'):
+                    tag = meta_id
+
+            field = cls.get_field_by_uri(tag)
             if field is None:
                 # Ignore unknown fields.
             if field is None:
                 # Ignore unknown fields.
-                ### TODO: does it do <meta> for isbn?
                 continue
 
                 continue
 
-            fv = field_dict.get(e.tag, [])
+            fv = field_dict.get(tag, [])
             if e.text is not None:
             if e.text is not None:
-                val = field.validator(e.text)
+                val = field.value_type.from_text(e.text)
                 val.lang = e.attrib.get(XMLNS('lang'), lang)
                 val.lang = e.attrib.get(XMLNS('lang'), lang)
-
-                if e.tag == 'meta':
-                    meta_id = e.attrib.get('id')
-                    if meta_id and meta_id.endswith('-id'):
-                        field_dict[meta_id] = [val.replace('ISBN-', 'ISBN ')]
             else:
                 val = e.text
             fv.append(val)
             else:
                 val = e.text
             fv.append(val)
-            field_dict[e.tag] = fv
+            field_dict[tag] = fv
 
         return cls(desc.attrib, field_dict, *args, **kwargs)
 
 
         return cls(desc.attrib, field_dict, *args, **kwargs)
 
@@ -320,11 +327,11 @@ class WorkInfo(six.with_metaclass(DCInfo, object)):
                     for x in v:
                         e = etree.Element(field.uri)
                         if x is not None:
                     for x in v:
                         e = etree.Element(field.uri)
                         if x is not None:
-                            e.text = six.text_type(x)
+                            e.text = str(x)
                         description.append(e)
                 else:
                     e = etree.Element(field.uri)
                         description.append(e)
                 else:
                     e = etree.Element(field.uri)
-                    e.text = six.text_type(v)
+                    e.text = str(v)
                     description.append(e)
 
         return root
                     description.append(e)
 
         return root
@@ -339,9 +346,9 @@ class WorkInfo(six.with_metaclass(DCInfo, object)):
                 if field.multiple:
                     if len(v) == 0:
                         continue
                 if field.multiple:
                     if len(v) == 0:
                         continue
-                    v = [six.text_type(x) for x in v if x is not None]
+                    v = [str(x) for x in v if x is not None]
                 else:
                 else:
-                    v = six.text_type(v)
+                    v = str(v)
 
                 dc[field.name] = {'uri': field.uri, 'value': v}
         rdf['fields'] = dc
 
                 dc[field.name] = {'uri': field.uri, 'value': v}
         rdf['fields'] = dc
@@ -356,62 +363,18 @@ class WorkInfo(six.with_metaclass(DCInfo, object)):
                 if field.multiple:
                     if len(v) == 0:
                         continue
                 if field.multiple:
                     if len(v) == 0:
                         continue
-                    v = [six.text_type(x) for x in v if x is not None]
+                    v = [str(x) for x in v if x is not None]
                 else:
                 else:
-                    v = six.text_type(v)
+                    v = str(v)
                 result[field.name] = v
 
             if field.salias:
                 v = getattr(self, field.salias)
                 if v is not None:
                 result[field.name] = v
 
             if field.salias:
                 v = getattr(self, field.salias)
                 if v is not None:
-                    result[field.salias] = six.text_type(v)
+                    result[field.salias] = str(v)
 
         return result
 
 
 
         return result
 
 
-class BookInfo(WorkInfo):
-    FIELDS = (
-        Field(DCNS('audience'), 'audiences', salias='audience', multiple=True,
-              required=False),
-
-        Field(DCNS('subject.period'), 'epochs', salias='epoch', multiple=True,
-              required=False),
-        Field(DCNS('subject.type'), 'kinds', salias='kind', multiple=True,
-              required=False),
-        Field(DCNS('subject.genre'), 'genres', salias='genre', multiple=True,
-              required=False),
-        Field(WLNS('category.legimi'), 'legimi', required=False),
-
-        Field(DCNS('subject.location'), 'location', required=False),
-
-        Field(DCNS('contributor.translator'), 'translators',
-              Person,  salias='translator', multiple=True, required=False),
-        Field(DCNS('relation.hasPart'), 'parts', WLURI,
-              multiple=True, required=False),
-        Field(DCNS('relation.isVariantOf'), 'variant_of', WLURI,
-              required=False),
-
-        Field(DCNS('relation.coverImage.url'), 'cover_url', required=False),
-        Field(DCNS('relation.coverImage.attribution'), 'cover_by',
-              required=False),
-        Field(DCNS('relation.coverImage.source'), 'cover_source',
-              required=False),
-        # WLCover-specific.
-        Field(WLNS('coverBarColor'), 'cover_bar_color', required=False),
-        Field(WLNS('coverBoxPosition'), 'cover_box_position', required=False),
-        Field(WLNS('coverClass'), 'cover_class', default=['default']),
-        Field(WLNS('coverLogoUrl'), 'cover_logo_urls', multiple=True,
-              required=False),
-        Field(WLNS('endnotes'), 'endnotes', BoolValue,
-              required=False),
-
-        Field('pdf-id',  'isbn_pdf',  required=False),
-        Field('epub-id', 'isbn_epub', required=False),
-        Field('mobi-id', 'isbn_mobi', required=False),
-        Field('txt-id',  'isbn_txt',  required=False),
-        Field('html-id', 'isbn_html', required=False),
-    )
-
-
 def parse(file_name, cls=BookInfo):
     return cls.from_file(file_name)
 def parse(file_name, cls=BookInfo):
     return cls.from_file(file_name)