910f5e1ae144b65791f23aaeb36e2ab9a5b030d1
[librarian.git] / src / librarian / dcparser.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 from xml.parsers.expat import ExpatError
5 from datetime import date
6 import io
7 import time
8 import re
9 from librarian.util import roman_to_int
10
11 from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS,
12                        XMLNS, WLNS, PLMETNS)
13
14 import lxml.etree as etree
15 from lxml.etree import XMLSyntaxError
16
17 from librarian.meta.types.bool import BoolValue
18 from librarian.meta.types.person import Person
19 from librarian.meta.types.wluri import WLURI
20 from librarian.meta.types import text
21
22
class Field:
    """Declaration of a single metadata field of a work.

    Arguments:
        uri: tag (with namespace) under which the field is looked up
            in the dictionary of parsed values.
        attr_name: name of the attribute exposing the field's value.
        value_type: class used to build values from element text.
        multiple: whether the field holds a list of values.
        salias: optional singular alias for the field.

    Keyword arguments:
        required: whether the field must be present (defaults to True).
        default: list of values used when the field is absent.
            Providing a default makes the field non-required.
    """

    def __init__(self, uri, attr_name, value_type=text.TextValue,
                 multiple=False, salias=None, **kwargs):
        self.uri = uri
        self.name = attr_name
        self.value_type = value_type
        self.multiple = multiple
        self.salias = salias

        # A field with an explicit default can never be "missing".
        self.required = (kwargs.get('required', True)
                         and 'default' not in kwargs)
        self.default = kwargs.get('default', [] if multiple else [None])

    def validate_value(self, val, strict=False):
        """Reduce a list of values to what the field should expose.

        Returns the list itself for multiple fields, and its only
        element for single fields.  Raises ValidationError if a single
        field got no value or more than one value.

        `strict` is accepted for interface compatibility
        and is currently unused.
        """
        try:
            if self.multiple:
                return val
            elif len(val) > 1:
                raise ValidationError(
                    "Multiple values not allowed for field '%s'" % self.uri
                )
            elif len(val) == 0:
                raise ValidationError(
                    "Field %s has no value to assign. Check your defaults."
                    % self.uri
                )
            else:
                return val[0]
        except ValueError as e:
            raise ValidationError(
                "Field '%s' - invalid value: %s"
                % (self.uri, str(e))
            ) from e

    def validate(self, fdict, fallbacks=None, strict=False,
                 validate_required=True):
        """Find this field's value in `fdict` and validate it.

        `fdict` maps field URIs to lists of values.  If the field is
        absent and not required, the value is taken from `fallbacks`
        (keyed by field name or singular alias), and finally from
        the field's default.

        If a required field is absent, ValidationError is raised,
        unless `validate_required` is false, in which case
        None is returned.
        """
        if fallbacks is None:
            fallbacks = {}
        if self.uri not in fdict:
            if not self.required:
                # Accept single value for single fields and saliases.
                if self.name in fallbacks:
                    if self.multiple:
                        f = fallbacks[self.name]
                    else:
                        f = [fallbacks[self.name]]
                elif self.salias and self.salias in fallbacks:
                    f = [fallbacks[self.salias]]
                else:
                    # Hand out a copy: multiple fields expose the list
                    # itself, so sharing it would let one object
                    # modify the default seen by all the others.
                    default = self.default
                    if isinstance(default, list):
                        f = list(default)
                    else:
                        f = default
            elif validate_required:
                raise ValidationError("Required field %s not found" % self.uri)
            else:
                return None
        else:
            f = fdict[self.uri]

        return self.validate_value(f, strict=strict)

    def __eq__(self, other):
        """Fields are considered equal when they share a name."""
        if isinstance(other, Field) and other.name == self.name:
            return True
        return False

    def __hash__(self):
        # Keep hashing consistent with __eq__; without this, defining
        # __eq__ alone would make fields unhashable.
        return hash(self.name)
88
89
90 class DCInfo(type):
91     def __new__(mcs, classname, bases, class_dict):
92         fields = list(class_dict['FIELDS'])
93
94         for base in bases[::-1]:
95             if hasattr(base, 'FIELDS'):
96                 for field in base.FIELDS[::-1]:
97                     try:
98                         fields.index(field)
99                     except ValueError:
100                         fields.insert(0, field)
101
102         class_dict['FIELDS'] = tuple(fields)
103         return super(DCInfo, mcs).__new__(mcs, classname, bases, class_dict)
104
105
class WorkInfo(metaclass=DCInfo):
    """Metadata of a work, read from its RDF description.

    Every field declared in FIELDS is available as an attribute named
    after the field.  Multiple-valued fields with a singular alias
    can also be accessed by the alias, which reads the first value
    and, when assigned, replaces the whole list with a single value.
    """

    FIELDS = (
        Field(DCNS('creator'), 'authors', Person, salias='author',
              multiple=True),
        Field(DCNS('title'), 'title'),
        Field(DCNS('type'), 'type', required=False, multiple=True),

        Field(DCNS('contributor.editor'), 'editors',
              Person, salias='editor', multiple=True, required=False),
        Field(DCNS('contributor.technical_editor'), 'technical_editors',
              Person, salias='technical_editor', multiple=True,
              required=False),
        Field(DCNS('contributor.funding'), 'funders', salias='funder',
              multiple=True, required=False),
        Field(DCNS('contributor.thanks'), 'thanks', required=False),

        Field(DCNS('date'), 'created_at'),
        Field(DCNS('date.pd'), 'released_to_public_domain_at',
              required=False),
        Field(DCNS('publisher'), 'publisher', multiple=True),

        Field(DCNS('language'), 'language'),
        Field(DCNS('description'), 'description', required=False),

        Field(DCNS('source'), 'source_name', required=False),
        Field(DCNS('source.URL'), 'source_urls', salias='source_url',
              multiple=True, required=False),
        Field(DCNS('identifier.url'), 'url', WLURI),
        Field(DCNS('rights.license'), 'license', required=False),
        Field(DCNS('rights'), 'license_description'),

        Field(PLMETNS('digitisationSponsor'), 'sponsors', multiple=True,
              required=False),
        Field(WLNS('digitisationSponsorNote'), 'sponsor_note', required=False),
        Field(WLNS('contentWarning'), 'content_warnings', multiple=True,
              required=False),
        Field(WLNS('developmentStage'), 'stage', required=False),
    )

    @classmethod
    def get_field_by_uri(cls, uri):
        """Return the field declared for `uri`, or None if unknown."""
        for f in cls.FIELDS:
            if f.uri == uri:
                return f
        return None

    @classmethod
    def from_bytes(cls, xml, *args, **kwargs):
        """Build the info from a bytes object containing the XML."""
        return cls.from_file(io.BytesIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build the info from the first rdf:RDF section in `xmlfile`.

        `xmlfile` is passed to lxml's iterparse.  Additional arguments
        are passed on to the constructor.

        Raises NoDublinCore if there is no RDF section,
        and ParseError if the XML cannot be parsed.
        """
        rdf_name = RDFNS('RDF')
        rdf_tag = None
        try:
            events = etree.iterparse(xmlfile, ['start', 'end'])
            for event, element in events:
                if event == 'start' and element.tag == rdf_name:
                    rdf_tag = element
                    break

            if rdf_tag is None:
                raise NoDublinCore(
                    "DublinCore section not found. "
                    "Check if there are rdf:RDF and rdf:Description tags."
                )

            # Keep parsing until the end of the RDF section, so that
            # the element is complete.  If the section never ends,
            # the parser reports a syntax error.
            for event, element in events:
                if event == 'end' and element.tag == rdf_name:
                    break

            # extract data from the element and make the info
            return cls.from_element(rdf_tag, *args, **kwargs)
        except (XMLSyntaxError, ExpatError) as e:
            raise ParseError(e) from e

    @classmethod
    def from_element(cls, rdf_tag, *args, **kwargs):
        """Build the info from an already parsed rdf:RDF element.

        Additional arguments are passed on to the constructor.
        Raises NoDublinCore if there is no rdf:Description inside.
        """
        # The tree is already parsed,
        # so we don't need to worry about Expat errors.
        field_dict = {}
        desc = rdf_tag.find(".//" + RDFNS('Description'))

        if desc is None:
            raise NoDublinCore(
                "There must be a '%s' element inside the RDF."
                % RDFNS('Description')
            )

        # Default language: the nearest xml:lang set on the description
        # or on any of its ancestors.
        lang = None
        p = desc
        while p is not None and lang is None:
            lang = p.attrib.get(XMLNS('lang'))
            p = p.getparent()

        for e in desc:
            tag = e.tag
            if tag == 'meta':
                # <meta id="...-id"> elements are looked up by their id.
                meta_id = e.attrib.get('id')
                if meta_id and meta_id.endswith('-id'):
                    tag = meta_id

            field = cls.get_field_by_uri(tag)
            if field is None:
                # Ignore unknown fields.
                continue

            if e.text is not None:
                val = field.value_type.from_text(e.text)
                val.lang = e.attrib.get(XMLNS('lang'), lang)
            else:
                # An empty element still counts as a (None) value.
                val = None
            field_dict.setdefault(tag, []).append(val)

        return cls(desc.attrib, field_dict, *args, **kwargs)

    def __init__(self, rdf_attrs, dc_fields, fallbacks=None, strict=False,
                 validate_required=True):
        """
        rdf_attrs should be a dictionary-like object with any attributes
        of the RDF:Description.
        dc_fields - dictionary mapping DC fields (with namespace) to
        list of text values for the given field.
        fallbacks, strict and validate_required are passed on to
        the validation of every field.
        """

        self.about = rdf_attrs.get(RDFNS('about'))
        self.fmap = {}

        for field in self.FIELDS:
            value = field.validate(dc_fields, fallbacks=fallbacks,
                                   strict=strict,
                                   validate_required=validate_required)
            # The value has to be stored before the field is registered
            # in fmap; otherwise the assignment would be redirected.
            setattr(self, 'prop_' + field.name, value)
            self.fmap[field.name] = field
            if field.salias:
                self.fmap[field.salias] = field

    def __getattribute__(self, name):
        """Resolve field names and singular aliases to stored values.

        Anything that is not a registered field is looked up normally.
        Raises TypeError for a singular alias of a single-valued field.
        """
        try:
            field = object.__getattribute__(self, 'fmap')[name]
            value = object.__getattribute__(self, 'prop_' + field.name)
        except (KeyError, AttributeError):
            return object.__getattribute__(self, name)

        if field.name == name:
            return value
        # singular alias
        if not field.multiple:
            raise TypeError(
                "Singular alias '%s' is only supported "
                "for multiple-valued fields" % name
            )
        return value[0] if value else None

    def __setattr__(self, name, newvalue):
        """Store field values, accepting singular aliases as well.

        Anything that is not a registered field is set normally.
        Raises TypeError for a singular alias of a single-valued field.
        """
        try:
            field = object.__getattribute__(self, 'fmap')[name]
        except (KeyError, AttributeError):
            object.__setattr__(self, name, newvalue)
            return

        if field.name == name:
            object.__setattr__(self, 'prop_' + field.name, newvalue)
            return
        # singular alias
        if not field.multiple:
            raise TypeError(
                "Singular alias '%s' is only supported "
                "for multiple-valued fields" % name
            )
        object.__setattr__(self, 'prop_' + field.name, [newvalue])

    def update(self, field_dict):
        """
        Update using field_dict, which maps field names to new values.
        The values are assigned as they are, without any validation;
        keys which are not field names are ignored.
        """
        for field in self.FIELDS:
            if field.name in field_dict:
                setattr(self, field.name, field_dict[field.name])

    def to_etree(self, parent=None):
        """XML representation of this object.

        Returns a new rdf:RDF element.  If `parent` is given,
        the element is made using parent.makeelement.
        """
        if parent is None:
            root = etree.Element(RDFNS('RDF'))
        else:
            root = parent.makeelement(RDFNS('RDF'))

        description = etree.SubElement(root, RDFNS('Description'))

        if self.about:
            description.set(RDFNS('about'), self.about)

        for field in self.FIELDS:
            v = getattr(self, field.name, None)
            if v is None:
                continue
            if field.multiple:
                for x in v:
                    # A None value is written as an empty element.
                    e = etree.SubElement(description, field.uri)
                    if x is not None:
                        e.text = str(x)
            else:
                e = etree.SubElement(description, field.uri)
                e.text = str(v)

        return root

    def _stringified(self, field):
        """Return the field's value as text, or None if there is none.

        For a multiple field it's a list of texts, with None values
        left out; an empty list of values counts as no value.
        """
        v = getattr(self, field.name, None)
        if v is None:
            return None
        if field.multiple:
            if len(v) == 0:
                return None
            return [str(x) for x in v if x is not None]
        return str(v)

    def serialize(self):
        """Return a dictionary with URIs and text values of the fields."""
        rdf = {'about': {'uri': RDFNS('about'), 'value': self.about}}

        dc = {}
        for field in self.FIELDS:
            v = self._stringified(field)
            if v is not None:
                dc[field.name] = {'uri': field.uri, 'value': v}
        rdf['fields'] = dc
        return rdf

    def to_dict(self):
        """Return a flat dictionary of text values.

        Values are stored under field names and, where a field
        has one, also under its singular alias.
        """
        result = {'about': self.about}
        for field in self.FIELDS:
            v = self._stringified(field)
            if v is not None:
                result[field.name] = v

            if field.salias:
                v = getattr(self, field.salias)
                if v is not None:
                    result[field.salias] = str(v)

        return result
351
352
class BookInfo(WorkInfo):
    """Work metadata extended with fields specific to books."""

    FIELDS = (
        # Classification.
        Field(
            DCNS('audience'), 'audiences', text.Audience,
            salias='audience', multiple=True, required=False,
        ),
        Field(
            DCNS('subject.period'), 'epochs', text.Epoch,
            salias='epoch', multiple=True, required=False,
        ),
        Field(
            DCNS('subject.type'), 'kinds', text.Kind,
            salias='kind', multiple=True, required=False,
        ),
        Field(
            DCNS('subject.genre'), 'genres', text.Genre,
            salias='genre', multiple=True, required=False,
        ),
        Field(
            'category.legimi', 'legimi', text.LegimiCategory,
            required=False,
        ),
        Field(
            'category.thema.main', 'thema_main', text.MainThemaCategory,
            required=False,
        ),
        Field(
            'category.thema', 'thema', text.ThemaCategory,
            required=False, multiple=True,
        ),
        Field(DCNS('subject.location'), 'location', required=False),

        # Translators and relations to other documents.
        Field(
            DCNS('contributor.translator'), 'translators', Person,
            salias='translator', multiple=True, required=False,
        ),
        Field(
            DCNS('relation.hasPart'), 'parts', WLURI,
            multiple=True, required=False,
        ),
        Field(
            DCNS('relation.isVariantOf'), 'variant_of', WLURI,
            required=False,
        ),

        # Cover image.
        Field(
            DCNS('relation.coverImage.url'), 'cover_url',
            required=False,
        ),
        Field(
            DCNS('relation.coverImage.attribution'), 'cover_by',
            required=False,
        ),
        Field(
            DCNS('relation.coverImage.source'), 'cover_source',
            required=False,
        ),

        # WLCover-specific.
        Field(WLNS('coverBarColor'), 'cover_bar_color', required=False),
        Field(WLNS('coverBoxPosition'), 'cover_box_position', required=False),
        Field(WLNS('coverClass'), 'cover_class', default=['default']),
        Field(
            WLNS('coverLogoUrl'), 'cover_logo_urls',
            multiple=True, required=False,
        ),
        Field(WLNS('endnotes'), 'endnotes', BoolValue, required=False),

        # Identifiers of particular formats.
        Field('pdf-id', 'isbn_pdf', required=False),
        Field('epub-id', 'isbn_epub', required=False),
        Field('mobi-id', 'isbn_mobi', required=False),
        Field('txt-id', 'isbn_txt', required=False),
        Field('html-id', 'isbn_html', required=False),
    )
396
397
def parse(file_name, cls=BookInfo):
    """Read metadata from `file_name` using the `from_file` of `cls`."""
    info = cls.from_file(file_name)
    return info