fe4b3fde74418a0bdde9c5dbef0c8c7ff94e3e11
[librarian.git] / src / librarian / dcparser.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from xml.parsers.expat import ExpatError
5 from datetime import date
6 import time
7 import re
8 import six
9 from librarian.util import roman_to_int
10
11 from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS,
12                        XMLNS, WLURI, WLNS, PLMETNS)
13
14 import lxml.etree as etree
15 from lxml.etree import XMLSyntaxError
16
17 from librarian.meta.types.bool import BoolValue
18 from librarian.meta.types.date import DateValue
19 from librarian.meta.types.person import Person
20 from librarian.meta.types.text import TextValue
21
22
class Field(object):
    """Declaration of one metadata field.

    A field binds an element URI to an attribute name on the info object
    and records whether it is multi-valued, required, and what default
    it falls back to.

    Fields compare equal (and hash) by attribute name only.
    """

    def __init__(self, uri, attr_name, validator=TextValue, strict=None,
                 multiple=False, salias=None, **kwargs):
        """
        uri - element tag (with namespace) carrying this field.
        attr_name - attribute name under which the value is exposed.
        validator - callable used to build a value from element text.
        strict - alternative validator selected in strict mode, if any.
        multiple - whether the field holds a list of values.
        salias - optional singular alias name for a multiple field.

        Recognized keyword arguments:
        required - defaults to True; ignored (treated as False) when
            a default is given.
        default - list of values used when the field is absent.
        """
        self.uri = uri
        self.name = attr_name
        self.validator = validator
        self.strict = strict
        self.multiple = multiple
        self.salias = salias

        # Supplying a default implicitly makes the field optional.
        self.required = (kwargs.get('required', True)
                         and 'default' not in kwargs)
        self.default = kwargs.get('default', [] if multiple else [None])

    def validate_value(self, val, strict=False):
        """Check the cardinality of `val` (a list of values) and unwrap it.

        For a multiple field a list is returned; for a single field the
        sole element is returned. Values are passed through unchanged;
        no validator is applied here.

        Raises ValidationError when a single field gets zero or more
        than one value.
        """
        if strict and self.strict is not None:
            validator = self.strict
        else:
            validator = self.validator
        try:
            if self.multiple:
                if validator is None:
                    return val
                # Hand back a fresh list so the input is not shared.
                return list(val)
            if len(val) > 1:
                raise ValidationError(
                    "Multiple values not allowed for field '%s'" % self.uri
                )
            if len(val) == 0:
                raise ValidationError(
                    "Field %s has no value to assign. Check your defaults."
                    % self.uri
                )
            return val[0]
        except ValueError as e:
            raise ValidationError(
                "Field '%s' - invalid value: %s"
                % (self.uri, str(e))
            )

    def validate(self, fdict, fallbacks=None, strict=False, validate_required=True):
        """Pick this field's values from `fdict` and validate them.

        fdict maps field URIs to lists of values. When the URI is absent
        and the field is optional, `fallbacks` is consulted by attribute
        name, then by singular alias, and finally the field default is
        used.

        Returns None for an absent required field when validate_required
        is false; raises ValidationError for it otherwise.
        """
        if fallbacks is None:
            fallbacks = {}

        if self.uri in fdict:
            f = fdict[self.uri]
        elif not self.required:
            # Accept single value for single fields and saliases.
            if self.name in fallbacks:
                if self.multiple:
                    f = fallbacks[self.name]
                else:
                    f = [fallbacks[self.name]]
            elif self.salias and self.salias in fallbacks:
                f = [fallbacks[self.salias]]
            else:
                f = self.default
        elif validate_required:
            raise ValidationError("Required field %s not found" % self.uri)
        else:
            return None

        return self.validate_value(f, strict=strict)

    def __eq__(self, other):
        return isinstance(other, Field) and other.name == self.name

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__.
        return not self.__eq__(other)

    def __hash__(self):
        # Keep hashing consistent with __eq__; defining __eq__ alone
        # would make instances unhashable on Python 3.
        return hash(self.name)
107
108
class DCInfo(type):
    """Metaclass merging the ``FIELDS`` declarations of a class hierarchy.

    The resulting class gets a ``FIELDS`` tuple made of the fields
    inherited from its bases followed by its own. A field declared by
    the class itself takes precedence over an equal field from a base.
    """

    def __new__(mcs, classname, bases, class_dict):
        # A class may omit FIELDS altogether and just inherit them.
        fields = list(class_dict.get('FIELDS', ()))

        # Walk bases and their fields backwards, prepending each missing
        # field, so that inherited fields end up first, in their
        # original order.
        for base in bases[::-1]:
            for field in getattr(base, 'FIELDS', ())[::-1]:
                if field not in fields:
                    fields.insert(0, field)

        class_dict['FIELDS'] = tuple(fields)
        return super(DCInfo, mcs).__new__(mcs, classname, bases, class_dict)
123
124
class WorkInfo(six.with_metaclass(DCInfo, object)):
    """Metadata of a work, read from an rdf:RDF section of a document.

    Each entry of FIELDS is exposed as an attribute named after the
    field; multiple fields with a singular alias are also reachable
    under that alias, which reads or writes a single value.
    """
    FIELDS = (
        Field(DCNS('creator'), 'authors', Person, salias='author',
              multiple=True),
        Field(DCNS('title'), 'title'),
        Field(DCNS('type'), 'type', required=False, multiple=True),

        Field(DCNS('contributor.editor'), 'editors',
              Person, salias='editor', multiple=True, required=False),
        Field(DCNS('contributor.technical_editor'), 'technical_editors',
              Person, salias='technical_editor', multiple=True,
              required=False),
        Field(DCNS('contributor.funding'), 'funders', salias='funder',
              multiple=True, required=False),
        Field(DCNS('contributor.thanks'), 'thanks', required=False),

        Field(DCNS('date'), 'created_at'),
        Field(DCNS('date.pd'), 'released_to_public_domain_at', DateValue,
              required=False),
        Field(DCNS('publisher'), 'publisher', multiple=True),

        Field(DCNS('language'), 'language'),
        Field(DCNS('description'), 'description', required=False),

        Field(DCNS('source'), 'source_name', required=False),
        Field(DCNS('source.URL'), 'source_urls', salias='source_url',
              multiple=True, required=False),
        Field(DCNS('identifier.url'), 'url', WLURI),
        Field(DCNS('rights.license'), 'license', required=False),
        Field(DCNS('rights'), 'license_description'),

        Field(PLMETNS('digitisationSponsor'), 'sponsors', multiple=True,
              required=False),
        Field(WLNS('digitisationSponsorNote'), 'sponsor_note', required=False),
        Field(WLNS('contentWarning'), 'content_warnings', multiple=True,
              required=False),
        Field(WLNS('developmentStage'), 'stage', required=False),
    )

    @classmethod
    def get_field_by_uri(cls, uri):
        """Return the declared field whose URI equals `uri`, or None."""
        for f in cls.FIELDS:
            if f.uri == uri:
                return f
        return None

    @classmethod
    def from_bytes(cls, xml, *args, **kwargs):
        """Build an instance from XML given as a bytes object."""
        return cls.from_file(six.BytesIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build an instance from the rdf:RDF section found in `xmlfile`.

        Extra arguments are passed on to from_element.

        Raises NoDublinCore when no rdf:RDF element is found, and
        ParseError when the XML cannot be parsed.
        """
        rdf_name = RDFNS('RDF')
        desc_tag = None
        try:
            events = etree.iterparse(xmlfile, ['start', 'end'])
            for (event, element) in events:
                if element.tag == rdf_name and event == 'start':
                    desc_tag = element
                    break

            if desc_tag is None:
                raise NoDublinCore(
                    "DublinCore section not found. "
                    "Check if there are rdf:RDF and rdf:Description tags."
                )

            # Keep parsing until the RDF section is closed, so that its
            # whole subtree is available. An unterminated section is
            # expected to surface as a parser error handled below.
            for (event, element) in events:
                if element.tag == rdf_name and event == 'end':
                    break

            # extract data from the element and make the info
            return cls.from_element(desc_tag, *args, **kwargs)
        except (XMLSyntaxError, ExpatError) as e:
            raise ParseError(e)

    @classmethod
    def from_element(cls, rdf_tag, *args, **kwargs):
        """Build an instance from an already parsed rdf:RDF element.

        Children of the rdf:Description element that match a declared
        field are collected; unknown children are ignored. A <meta>
        child whose `id` attribute ends with '-id' is looked up under
        that id instead of its tag, with 'ISBN-' in its text replaced
        by 'ISBN '.

        Raises NoDublinCore when there is no rdf:Description element.
        """
        field_dict = {}
        desc = rdf_tag.find(".//" + RDFNS('Description'))

        if desc is None:
            raise NoDublinCore(
                "There must be a '%s' element inside the RDF."
                % RDFNS('Description')
            )

        # Default language: nearest xml:lang on the description element
        # or any of its ancestors.
        lang = None
        p = desc
        while p is not None and lang is None:
            lang = p.attrib.get(XMLNS('lang'))
            p = p.getparent()

        for e in desc:
            tag = e.tag
            from_meta = False
            if tag == 'meta':
                # This has to happen before the field lookup: no field
                # is declared under the bare 'meta' tag.
                meta_id = e.attrib.get('id')
                if meta_id and meta_id.endswith('-id'):
                    tag = meta_id
                    from_meta = True

            field = cls.get_field_by_uri(tag)
            if field is None:
                # Ignore unknown fields.
                continue

            text = e.text
            if text is not None:
                if from_meta:
                    text = text.replace('ISBN-', 'ISBN ')
                val = field.validator(text)
                val.lang = e.attrib.get(XMLNS('lang'), lang)
            else:
                val = None
            field_dict.setdefault(tag, []).append(val)

        return cls(desc.attrib, field_dict, *args, **kwargs)

    def __init__(self, rdf_attrs, dc_fields, fallbacks=None, strict=False, validate_required=True):
        """
        rdf_attrs should be a dictionary-like object with any attributes
        of the RDF:Description.
        dc_fields - dictionary mapping DC fields (with namespace) to
        list of text values for the given field.
        fallbacks, strict, validate_required - passed to the validate
        method of every field.
        """

        self.about = rdf_attrs.get(RDFNS('about'))
        self.fmap = {}

        for field in self.FIELDS:
            value = field.validate(dc_fields, fallbacks=fallbacks,
                                   strict=strict, validate_required=validate_required)
            # Values live under 'prop_<name>'; attribute access by field
            # name or alias is routed there by __getattribute__.
            setattr(self, 'prop_' + field.name, value)
            self.fmap[field.name] = field
            if field.salias:
                self.fmap[field.salias] = field

    def __getattribute__(self, name):
        """Resolve field names and singular aliases to stored values.

        A singular alias yields the first value of its field, or None
        when there is none. Other names use regular attribute lookup.
        """
        try:
            field = object.__getattribute__(self, 'fmap')[name]
            value = object.__getattribute__(self, 'prop_'+field.name)
            if field.name == name:
                return value
            else:  # singular alias
                if not field.multiple:
                    raise TypeError(
                        "Singular alias '%s' is declared on a field "
                        "which is not multiple" % name
                    )

                return value[0] if value else None
        except (KeyError, AttributeError):
            # Not a field (or fields not set up yet).
            return object.__getattribute__(self, name)

    def __setattr__(self, name, newvalue):
        """Store field values; a singular alias sets a one-element list."""
        try:
            field = object.__getattribute__(self, 'fmap')[name]
            if field.name == name:
                object.__setattr__(self, 'prop_'+field.name, newvalue)
            else:  # singular alias
                if not field.multiple:
                    raise TypeError(
                        "Singular alias '%s' is declared on a field "
                        "which is not multiple" % name
                    )

                object.__setattr__(self, 'prop_'+field.name, [newvalue])
        except (KeyError, AttributeError):
            # Not a field (or fields not set up yet).
            return object.__setattr__(self, name, newvalue)

    def update(self, field_dict):
        """
        Update using field_dict, which maps field names to new values.
        Values are assigned as given; names which are not field names
        are ignored, and presence of required fields is not checked.
        """
        for field in self.FIELDS:
            if field.name in field_dict:
                setattr(self, field.name, field_dict[field.name])

    def to_etree(self, parent=None):
        """XML representation of this object.

        Returns an rdf:RDF element with one rdf:Description child. With
        `parent` given, the element is created with parent.makeelement;
        this method does not append it to the parent.
        """
        # etree._namespace_map[str(self.RDF)] = 'rdf'
        # etree._namespace_map[str(self.DC)] = 'dc'

        if parent is None:
            root = etree.Element(RDFNS('RDF'))
        else:
            root = parent.makeelement(RDFNS('RDF'))

        description = etree.SubElement(root, RDFNS('Description'))

        if self.about:
            description.set(RDFNS('about'), self.about)

        for field in self.FIELDS:
            v = getattr(self, field.name, None)
            if v is not None:
                if field.multiple:
                    if len(v) == 0:
                        continue
                    for x in v:
                        # A None value gives an element without text.
                        e = etree.Element(field.uri)
                        if x is not None:
                            e.text = six.text_type(x)
                        description.append(e)
                else:
                    e = etree.Element(field.uri)
                    e.text = six.text_type(v)
                    description.append(e)

        return root

    def serialize(self):
        """Return a dict with the 'about' attribute and all set fields.

        Each entry holds the URI and the value converted to text
        (a list of texts for multiple fields). Unset fields and empty
        multiple fields are left out.
        """
        rdf = {'about': {'uri': RDFNS('about'), 'value': self.about}}

        dc = {}
        for field in self.FIELDS:
            v = getattr(self, field.name, None)
            if v is not None:
                if field.multiple:
                    if len(v) == 0:
                        continue
                    v = [six.text_type(x) for x in v if x is not None]
                else:
                    v = six.text_type(v)

                dc[field.name] = {'uri': field.uri, 'value': v}
        rdf['fields'] = dc
        return rdf

    def to_dict(self):
        """Return a flat dict of 'about' and field values as text.

        Set fields are stored under their names; for fields with
        a singular alias, the alias value is stored as well.
        """
        result = {'about': self.about}
        for field in self.FIELDS:
            v = getattr(self, field.name, None)

            if v is not None:
                if field.multiple:
                    if len(v) == 0:
                        continue
                    v = [six.text_type(x) for x in v if x is not None]
                else:
                    v = six.text_type(v)
                result[field.name] = v

            if field.salias:
                v = getattr(self, field.salias)
                if v is not None:
                    result[field.salias] = six.text_type(v)

        return result
370
371
class BookInfo(WorkInfo):
    """Work metadata extended with book-specific fields.

    Declares audience, subject, translator, relation, cover and
    per-format id fields on top of the ones declared by WorkInfo.
    """
    FIELDS = (
        Field(DCNS('audience'), 'audiences',
              salias='audience', multiple=True, required=False),

        # Subject classification.
        Field(DCNS('subject.period'), 'epochs',
              salias='epoch', multiple=True, required=False),
        Field(DCNS('subject.type'), 'kinds',
              salias='kind', multiple=True, required=False),
        Field(DCNS('subject.genre'), 'genres',
              salias='genre', multiple=True, required=False),
        Field(WLNS('category.legimi'), 'legimi', required=False),

        Field(DCNS('subject.location'), 'location', required=False),

        # Translators and relations to other documents.
        Field(DCNS('contributor.translator'), 'translators', Person,
              salias='translator', multiple=True, required=False),
        Field(DCNS('relation.hasPart'), 'parts', WLURI,
              multiple=True, required=False),
        Field(DCNS('relation.isVariantOf'), 'variant_of', WLURI,
              required=False),

        # Cover image.
        Field(DCNS('relation.coverImage.url'), 'cover_url',
              required=False),
        Field(DCNS('relation.coverImage.attribution'), 'cover_by',
              required=False),
        Field(DCNS('relation.coverImage.source'), 'cover_source',
              required=False),
        # WLCover-specific.
        Field(WLNS('coverBarColor'), 'cover_bar_color', required=False),
        Field(WLNS('coverBoxPosition'), 'cover_box_position',
              required=False),
        Field(WLNS('coverClass'), 'cover_class', default=['default']),
        Field(WLNS('coverLogoUrl'), 'cover_logo_urls',
              multiple=True, required=False),
        Field(WLNS('endnotes'), 'endnotes', BoolValue, required=False),
    ) + tuple(
        # One optional '<format>-id' field per output format.
        Field('%s-id' % fmt, 'isbn_%s' % fmt, required=False)
        for fmt in ('pdf', 'epub', 'mobi', 'txt', 'html')
    )
414
415
def parse(file_name, cls=BookInfo):
    """Read metadata from `file_name` using `cls.from_file`.

    Returns whatever `cls.from_file` returns; `cls` defaults to BookInfo.
    """
    info = cls.from_file(file_name)
    return info