b89abd1e5a9c63c4b12fcbdae60bdf70c9f0f986
[librarian.git] / src / librarian / dcparser.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from xml.parsers.expat import ExpatError
5 from datetime import date
6 import time
7 import re
8 import six
9 from librarian.util import roman_to_int
10
11 from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS,
12                        XMLNS, WLNS, PLMETNS)
13
14 import lxml.etree as etree
15 from lxml.etree import XMLSyntaxError
16
17 from librarian.meta.types.bool import BoolValue
18 from librarian.meta.types.date import DateValue
19 from librarian.meta.types.person import Person
20 from librarian.meta.types.wluri import WLURI
21 from librarian.meta.types import text
22
23
class Field(object):
    """Declarative description of a single metadata field.

    uri        -- key under which raw values are looked up in the parsed
                  field dictionary.
    attr_name  -- attribute name under which the value is exposed.
    value_type -- class used by the parser to build values from text.
    multiple   -- whether the field keeps a list of values or exactly one.
    salias     -- optional singular alias for the field.

    Extra keyword arguments:
    required   -- defaults to True; passing `default` always makes the
                  field optional.
    default    -- list of raw values used when nothing else is available.
    """

    def __init__(self, uri, attr_name, value_type=text.TextValue,
                 multiple=False, salias=None, **kwargs):
        self.uri = uri
        self.name = attr_name
        self.value_type = value_type
        self.multiple = multiple
        self.salias = salias

        # A field with an explicit default can never be "missing",
        # so it is treated as optional whatever `required` says.
        self.required = (kwargs.get('required', True)
                         and 'default' not in kwargs)
        self.default = kwargs.get('default', [] if multiple else [None])

    def validate_value(self, val, strict=False):
        """Reduce a list of raw values to the value stored for this field.

        Returns `val` unchanged for a multiple field, and its only element
        for a single field.  Raises ValidationError when a single field
        gets no value or more than one.

        `strict` is accepted for interface compatibility; it is not used.
        """
        try:
            if self.multiple:
                return val
            if len(val) > 1:
                raise ValidationError(
                    "Multiple values not allowed for field '%s'" % self.uri
                )
            if len(val) == 0:
                raise ValidationError(
                    "Field %s has no value to assign. Check your defaults."
                    % self.uri
                )
            return val[0]
        except ValueError as e:
            raise ValidationError(
                "Field '%s' - invald value: %s"
                % (self.uri, str(e))
            )

    def validate(self, fdict, fallbacks=None, strict=False, validate_required=True):
        """Pick this field's value out of `fdict` and validate it.

        fdict             -- dictionary mapping field URIs to lists of values.
        fallbacks         -- optional dictionary consulted for optional
                             fields absent from `fdict`, keyed by field
                             name or by singular alias.
        validate_required -- when false, a missing required field yields
                             None instead of raising ValidationError.
        """
        if fallbacks is None:
            fallbacks = {}

        if self.uri in fdict:
            f = fdict[self.uri]
        elif not self.required:
            # Accept single value for single fields and saliases.
            if self.name in fallbacks:
                if self.multiple:
                    f = fallbacks[self.name]
                else:
                    f = [fallbacks[self.name]]
            elif self.salias and self.salias in fallbacks:
                f = [fallbacks[self.salias]]
            else:
                f = self.default
                if self.multiple and isinstance(f, list):
                    # For multiple fields the list itself becomes the
                    # stored value; hand out a copy so that changing it
                    # cannot alter the default shared by all users of
                    # this Field.
                    f = list(f)
        elif validate_required:
            raise ValidationError("Required field %s not found" % self.uri)
        else:
            return None

        return self.validate_value(f, strict=strict)

    def __eq__(self, other):
        # Fields are identified by attribute name only.
        return isinstance(other, Field) and other.name == self.name

    def __ne__(self, other):
        # Python 2 does not derive != from ==.
        return not self.__eq__(other)

    def __hash__(self):
        # Defining __eq__ alone makes instances unhashable on Python 3;
        # hash on the same key that equality uses.
        return hash(self.name)
89
90
class DCInfo(type):
    """Metaclass merging the FIELDS declared by a class and by its bases.

    The resulting FIELDS tuple holds the fields inherited from base classes
    first, followed by the fields declared by the class itself.  A field
    already present (compared with ==) is not added a second time, so
    a class can override an inherited field by redeclaring it.
    """

    def __new__(mcs, classname, bases, class_dict):
        # A class that declares no FIELDS of its own simply inherits
        # the fields of its bases.
        fields = list(class_dict.get('FIELDS', ()))

        # Walk bases and their fields backwards, prepending each missing
        # field, so that the inherited fields end up in declaration order.
        for base in reversed(bases):
            for field in reversed(getattr(base, 'FIELDS', ())):
                if field not in fields:
                    fields.insert(0, field)

        class_dict['FIELDS'] = tuple(fields)
        return super(DCInfo, mcs).__new__(mcs, classname, bases, class_dict)
105
106
class WorkInfo(six.with_metaclass(DCInfo, object)):
    """Metadata of a work, as read from the rdf:Description of a document.

    Every entry of FIELDS is exposed as an attribute named after the field.
    A field declaring a singular alias (`salias`) can also be read and
    written through that alias, which stands for the first of its values.
    """

    FIELDS = (
        Field(DCNS('creator'), 'authors', Person, salias='author',
              multiple=True),
        Field(DCNS('title'), 'title'),
        Field(DCNS('type'), 'type', required=False, multiple=True),

        Field(DCNS('contributor.editor'), 'editors',
              Person, salias='editor', multiple=True, required=False),
        Field(DCNS('contributor.technical_editor'), 'technical_editors',
              Person, salias='technical_editor', multiple=True,
              required=False),
        Field(DCNS('contributor.funding'), 'funders', salias='funder',
              multiple=True, required=False),
        Field(DCNS('contributor.thanks'), 'thanks', required=False),

        Field(DCNS('date'), 'created_at'),
        Field(DCNS('date.pd'), 'released_to_public_domain_at', DateValue,
              required=False),
        Field(DCNS('publisher'), 'publisher', multiple=True),

        Field(DCNS('language'), 'language'),
        Field(DCNS('description'), 'description', required=False),

        Field(DCNS('source'), 'source_name', required=False),
        Field(DCNS('source.URL'), 'source_urls', salias='source_url',
              multiple=True, required=False),
        Field(DCNS('identifier.url'), 'url', WLURI),
        Field(DCNS('rights.license'), 'license', required=False),
        Field(DCNS('rights'), 'license_description'),

        Field(PLMETNS('digitisationSponsor'), 'sponsors', multiple=True,
              required=False),
        Field(WLNS('digitisationSponsorNote'), 'sponsor_note', required=False),
        Field(WLNS('contentWarning'), 'content_warnings', multiple=True,
              required=False),
        Field(WLNS('developmentStage'), 'stage', required=False),
    )

    @classmethod
    def get_field_by_uri(cls, uri):
        """Return the first field declared with the given URI, or None."""
        for f in cls.FIELDS:
            if f.uri == uri:
                return f
        return None

    @classmethod
    def from_bytes(cls, xml, *args, **kwargs):
        """Build an instance from a bytes object holding an XML document."""
        return cls.from_file(six.BytesIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build an instance from an XML file, reading up to the end of rdf:RDF.

        Extra arguments are passed on to the constructor.
        Raises NoDublinCore if no rdf:RDF element is found and ParseError
        if the document is not well-formed.
        """
        rdf_name = RDFNS('RDF')
        rdf_tag = None
        try:
            events = etree.iterparse(xmlfile, ['start', 'end'])
            for (event, element) in events:
                if event == 'start' and element.tag == rdf_name:
                    rdf_tag = element
                    break

            if rdf_tag is None:
                raise NoDublinCore(
                    "DublinCore section not found. "
                    "Check if there are rdf:RDF and rdf:Description tags."
                )

            # Continue until the end of the RDF section, so that the whole
            # subtree is built before it is read.
            for (event, element) in events:
                if event == 'end' and element.tag == rdf_name:
                    break

            # If the section is never closed, the parser is expected
            # to report it with an exception handled below.

            # Extract data from the element and make the info.
            return cls.from_element(rdf_tag, *args, **kwargs)
        except (XMLSyntaxError, ExpatError) as e:
            raise ParseError(e)

    @classmethod
    def from_element(cls, rdf_tag, *args, **kwargs):
        """Build an instance from an already parsed rdf:RDF element.

        Extra arguments are passed on to the constructor.
        Raises NoDublinCore if there is no rdf:Description inside.
        """
        desc = rdf_tag.find(".//" + RDFNS('Description'))
        if desc is None:
            raise NoDublinCore(
                "There must be a '%s' element inside the RDF."
                % RDFNS('Description')
            )

        # Default language: the nearest xml:lang set on the description
        # or on any of its ancestors.
        lang = None
        p = desc
        while p is not None and lang is None:
            lang = p.attrib.get(XMLNS('lang'))
            p = p.getparent()

        field_dict = {}
        for e in desc:
            tag = e.tag
            if tag == 'meta':
                # Plain <meta id="...-id"> elements are looked up
                # by their id instead of their tag.
                meta_id = e.attrib.get('id')
                if meta_id and meta_id.endswith('-id'):
                    tag = meta_id

            field = cls.get_field_by_uri(tag)
            if field is None:
                # Ignore unknown fields.
                continue

            if e.text is not None:
                val = field.value_type.from_text(e.text)
                val.lang = e.attrib.get(XMLNS('lang'), lang)
            else:
                # An empty element still counts as a (None) value.
                val = None
            field_dict.setdefault(tag, []).append(val)

        return cls(desc.attrib, field_dict, *args, **kwargs)

    def __init__(self, rdf_attrs, dc_fields, fallbacks=None, strict=False, validate_required=True):
        """
        rdf_attrs should be a dictionary-like object with any attributes
        of the RDF:Description.
        dc_fields - dictionary mapping DC fields (with namespace) to
        list of text values for the given field.
        fallbacks, strict and validate_required are passed to the
        validate() method of every field.
        """

        self.about = rdf_attrs.get(RDFNS('about'))
        self.fmap = {}

        for field in self.FIELDS:
            value = field.validate(dc_fields, fallbacks=fallbacks,
                                   strict=strict, validate_required=validate_required)
            # Values live under 'prop_<name>'; __getattribute__ and
            # __setattr__ redirect field names and aliases there.
            setattr(self, 'prop_' + field.name, value)
            self.fmap[field.name] = field
            if field.salias:
                self.fmap[field.salias] = field

    def __getattribute__(self, name):
        """Resolve field names and singular aliases to stored values.

        Any other name is looked up as a regular attribute.
        """
        try:
            field = object.__getattribute__(self, 'fmap')[name]
            value = object.__getattribute__(self, 'prop_' + field.name)
            if field.name == name:
                return value

            # Singular alias: only meaningful for multiple fields.
            if not field.multiple:
                raise TypeError(
                    "Singular alias '%s' is defined for single-valued "
                    "field '%s'" % (name, field.name)
                )
            return value[0] if value else None
        except (KeyError, AttributeError):
            # Not a field, or fields are not set up yet.
            return object.__getattribute__(self, name)

    def __setattr__(self, name, newvalue):
        """Store field values, wrapping a value set by alias in a list.

        Any other name is set as a regular attribute.
        """
        try:
            field = object.__getattribute__(self, 'fmap')[name]
            if field.name == name:
                object.__setattr__(self, 'prop_' + field.name, newvalue)
            else:
                # Singular alias: only meaningful for multiple fields.
                if not field.multiple:
                    raise TypeError(
                        "Singular alias '%s' is defined for single-valued "
                        "field '%s'" % (name, field.name)
                    )
                object.__setattr__(self, 'prop_' + field.name, [newvalue])
        except (KeyError, AttributeError):
            # Not a field, or fields are not set up yet.
            return object.__setattr__(self, name, newvalue)

    def update(self, field_dict):
        """
        Set field values from field_dict, which is keyed by field name.
        Fields absent from field_dict are left untouched; the new values
        are stored as given, without validation.
        """
        for field in self.FIELDS:
            if field.name in field_dict:
                setattr(self, field.name, field_dict[field.name])

    def to_etree(self, parent=None):
        """XML representation of this object.

        Returns an rdf:RDF element; if `parent` is given, the element
        is created with parent.makeelement().
        """
        if parent is None:
            root = etree.Element(RDFNS('RDF'))
        else:
            root = parent.makeelement(RDFNS('RDF'))

        description = etree.SubElement(root, RDFNS('Description'))

        if self.about:
            description.set(RDFNS('about'), self.about)

        for field in self.FIELDS:
            v = getattr(self, field.name, None)
            if v is None:
                continue
            if field.multiple:
                for x in v:
                    # A None value is written as an empty element.
                    e = etree.SubElement(description, field.uri)
                    if x is not None:
                        e.text = six.text_type(x)
            else:
                e = etree.SubElement(description, field.uri)
                e.text = six.text_type(v)

        return root

    def _text_value(self, field):
        """Return the field's value as text, or None if there is none.

        For a multiple field the result is a list of texts with None
        values left out; an empty list of values counts as no value.
        """
        v = getattr(self, field.name, None)
        if v is None:
            return None
        if field.multiple:
            if len(v) == 0:
                return None
            return [six.text_type(x) for x in v if x is not None]
        return six.text_type(v)

    def serialize(self):
        """Return a dictionary with 'about' and 'fields' entries.

        Both carry the URI next to the value; fields are keyed by name
        and fields without a value are left out.
        """
        rdf = {'about': {'uri': RDFNS('about'), 'value': self.about}}

        dc = {}
        for field in self.FIELDS:
            v = self._text_value(field)
            if v is not None:
                dc[field.name] = {'uri': field.uri, 'value': v}
        rdf['fields'] = dc
        return rdf

    def to_dict(self):
        """Return a flat dictionary of 'about' and text values of fields.

        Fields are keyed by name; a field with a singular alias is also
        included under the alias.  Empty values are left out.
        """
        result = {'about': self.about}
        for field in self.FIELDS:
            v = self._text_value(field)
            if v is not None:
                result[field.name] = v

            if field.salias:
                v = getattr(self, field.salias)
                if v is not None:
                    result[field.salias] = six.text_type(v)

        return result
352
353
class BookInfo(WorkInfo):
    """Work metadata extended with fields specific to books."""

    FIELDS = (
        # Classification.
        Field(
            DCNS('audience'), 'audiences', value_type=text.Audience,
            salias='audience', multiple=True, required=False,
        ),
        Field(
            DCNS('subject.period'), 'epochs', value_type=text.Epoch,
            salias='epoch', multiple=True, required=False,
        ),
        Field(
            DCNS('subject.type'), 'kinds', value_type=text.Kind,
            salias='kind', multiple=True, required=False,
        ),
        Field(
            DCNS('subject.genre'), 'genres', value_type=text.Genre,
            salias='genre', multiple=True, required=False,
        ),
        Field(
            WLNS('category.legimi'), 'legimi',
            value_type=text.LegimiCategory, required=False,
        ),
        Field(
            WLNS('category.thema'), 'thema',
            value_type=text.ThemaCategory, required=False, multiple=True,
        ),
        Field(DCNS('subject.location'), 'location', required=False),

        # Contributors and relations to other documents.
        Field(
            DCNS('contributor.translator'), 'translators',
            value_type=Person, salias='translator', multiple=True,
            required=False,
        ),
        Field(
            DCNS('relation.hasPart'), 'parts', value_type=WLURI,
            multiple=True, required=False,
        ),
        Field(
            DCNS('relation.isVariantOf'), 'variant_of', value_type=WLURI,
            required=False,
        ),

        # Cover image.
        Field(DCNS('relation.coverImage.url'), 'cover_url', required=False),
        Field(
            DCNS('relation.coverImage.attribution'), 'cover_by',
            required=False,
        ),
        Field(
            DCNS('relation.coverImage.source'), 'cover_source',
            required=False,
        ),
        # WLCover-specific.
        Field(WLNS('coverBarColor'), 'cover_bar_color', required=False),
        Field(WLNS('coverBoxPosition'), 'cover_box_position', required=False),
        Field(WLNS('coverClass'), 'cover_class', default=['default']),
        Field(
            WLNS('coverLogoUrl'), 'cover_logo_urls', multiple=True,
            required=False,
        ),
        Field(
            WLNS('endnotes'), 'endnotes', value_type=BoolValue,
            required=False,
        ),

        # Identifiers of particular formats; these names carry
        # no namespace.
        Field('pdf-id', 'isbn_pdf', required=False),
        Field('epub-id', 'isbn_epub', required=False),
        Field('mobi-id', 'isbn_mobi', required=False),
        Field('txt-id', 'isbn_txt', required=False),
        Field('html-id', 'isbn_html', required=False),
    )
396
397
def parse(file_name, cls=BookInfo):
    """Read metadata from `file_name` using `cls` (BookInfo by default).

    The argument is handed to cls.from_file() as is, and its result
    is returned.
    """
    info = cls.from_file(file_name)
    return info