Basic biblical tools.
[librarian.git] / src / librarian / dcparser.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 from xml.parsers.expat import ExpatError
5 from datetime import date
6 import io
7 import time
8 import re
9 from librarian.util import roman_to_int
10
11 from librarian import (ValidationError, NoDublinCore, ParseError, DCNS, RDFNS,
12                        XMLNS, WLNS, PLMETNS)
13
14 import lxml.etree as etree
15 from lxml.etree import XMLSyntaxError
16
17 from librarian.meta.types.bool import BoolValue
18 from librarian.meta.types.person import Person
19 from librarian.meta.types.wluri import WLURI
20 from librarian.meta.types import text
21
22
class Field:
    """Description of a single metadata field and how to validate it.

    Attributes:
        uri: key under which the field's values are looked up in the
            field dictionary passed to :meth:`validate`.
        name: attribute name of the field; also the key checked in the
            ``fallbacks`` mapping.
        value_type: class associated with the field's values (stored
            only; this class does not call it).
        multiple: whether the field holds a list of values rather than
            a single one.
        salias: optional singular alias name, also checked in
            ``fallbacks``.
        required: True unless ``required=False`` or a ``default`` was
            passed to the constructor.
        default: list of values used when an optional field is absent
            and no fallback applies.
    """

    def __init__(self, uri, attr_name, value_type=text.TextValue,
                 multiple=False, salias=None, **kwargs):
        """Create a field description.

        Recognised keyword arguments are ``required`` (default True) and
        ``default`` (a list of values); supplying ``default`` makes the
        field optional regardless of ``required``.
        """
        self.uri = uri
        self.name = attr_name
        self.value_type = value_type
        self.multiple = multiple
        self.salias = salias

        self.required = (kwargs.get('required', True)
                         and 'default' not in kwargs)
        self.default = kwargs.get('default', [] if multiple else [None])

    def validate_value(self, val, strict=False):
        """Reduce a list of values to what the field should hold.

        For a multiple field the list is returned unchanged.  For a
        single field the list must contain exactly one item, which is
        returned.  ``strict`` is accepted but currently unused.

        Raises:
            ValidationError: if a single field has zero or several
                values, or if a ValueError occurs while checking.
        """
        try:
            if self.multiple:
                return val
            elif len(val) > 1:
                raise ValidationError(
                    "Multiple values not allowed for field '%s'" % self.uri
                )
            elif len(val) == 0:
                raise ValidationError(
                    "Field %s has no value to assign. Check your defaults."
                    % self.uri
                )
            else:
                return val[0]
        except ValueError as e:
            raise ValidationError(
                "Field '%s' - invald value: %s"
                % (self.uri, str(e))
            ) from e

    def validate(self, fdict, fallbacks=None, strict=False, validate_required=True):
        """Pick this field's value out of ``fdict`` and validate it.

        Args:
            fdict: mapping of field URI to a list of values.
            fallbacks: optional mapping consulted for optional fields
                that are missing from ``fdict``; looked up by field name
                first, then by singular alias.
            strict: passed through to :meth:`validate_value`.
            validate_required: when False, a missing required field
                yields None instead of raising.

        Returns:
            The validated value (a list for multiple fields), or None
            for a missing required field when ``validate_required`` is
            False.

        Raises:
            ValidationError: if a required field is missing and
                ``validate_required`` is True, or if the value count is
                wrong for a single field.
        """
        if fallbacks is None:
            fallbacks = {}
        if self.uri not in fdict:
            if not self.required:
                # Accept single value for single fields and saliases.
                if self.name in fallbacks:
                    if self.multiple:
                        f = fallbacks[self.name]
                    else:
                        f = [fallbacks[self.name]]
                elif self.salias and self.salias in fallbacks:
                    f = [fallbacks[self.salias]]
                else:
                    f = self.default
                    # The default list lives on the Field, which is
                    # shared by every object validated against it.  For
                    # multiple fields the list itself is returned, so
                    # hand out a copy to keep later mutations from
                    # leaking into the shared default.
                    if isinstance(f, list):
                        f = list(f)
            elif validate_required:
                raise ValidationError("Required field %s not found" % self.uri)
            else:
                return None
        else:
            f = fdict[self.uri]

        return self.validate_value(f, strict=strict)

    def __eq__(self, other):
        """Two fields are equal when both are Fields with the same name."""
        return isinstance(other, Field) and other.name == self.name

    def __hash__(self):
        # Defining __eq__ alone makes instances unhashable; hash on the
        # same key that __eq__ compares.
        return hash(self.name)
88
89
class BookInfo:
    """Metadata of a document, read from its rdf:Description element.

    ``FIELDS`` declares the known fields.  Each validated value is
    stored on the instance as ``prop_<field name>`` and is readable and
    writable under the field's name; multiple fields with a singular
    alias (``salias``) also expose the first value under that alias.
    """

    FIELDS = (
        Field(DCNS('creator'), 'authors', Person, salias='author',
              multiple=True),
        Field(DCNS('title'), 'title'),
        Field(DCNS('type'), 'type', required=False, multiple=True),

        Field(DCNS('contributor.editor'), 'editors',
              Person, salias='editor', multiple=True, required=False),
        Field(DCNS('contributor.technical_editor'), 'technical_editors',
              Person, salias='technical_editor', multiple=True,
              required=False),
        Field(DCNS('contributor.funding'), 'funders', salias='funder',
              multiple=True, required=False),
        Field(DCNS('contributor.thanks'), 'thanks', required=False),

        Field(DCNS('date'), 'created_at'),
        Field(DCNS('date.pd'), 'released_to_public_domain_at',
              required=False),
        Field(DCNS('publisher'), 'publisher', multiple=True),

        Field(DCNS('language'), 'language'),
        Field(DCNS('description'), 'description', required=False),

        Field(DCNS('source'), 'source_name', required=False),
        Field(DCNS('source.URL'), 'source_urls', salias='source_url',
              multiple=True, required=False),
        Field(DCNS('identifier.url'), 'url', WLURI),
        Field(DCNS('rights.license'), 'license', required=False),
        Field(DCNS('rights'), 'license_description'),

        Field(PLMETNS('digitisationSponsor'), 'sponsors', multiple=True,
              required=False),
        Field(WLNS('digitisationSponsorNote'), 'sponsor_note', required=False),
        Field(WLNS('contentWarning'), 'content_warnings', multiple=True,
              required=False),
        Field(WLNS('developmentStage'), 'stage', required=False),

        Field(DCNS('audience'), 'audiences', text.Audience, salias='audience', multiple=True,
              required=False),

        Field(DCNS('subject.period'), 'epochs', text.Epoch, salias='epoch', multiple=True,
              required=False),
        Field(DCNS('subject.type'), 'kinds', text.Kind, salias='kind', multiple=True,
              required=False),
        Field(DCNS('subject.genre'), 'genres', text.Genre, salias='genre', multiple=True,
              required=False),
        Field('category.legimi', 'legimi', text.LegimiCategory, required=False),
        Field('category.thema.main', 'thema_main', text.MainThemaCategory, required=False),
        Field('category.thema', 'thema', text.ThemaCategory, required=False, multiple=True),
        Field(DCNS('subject.location'), 'location', required=False),

        Field(DCNS('contributor.translator'), 'translators',
              Person,  salias='translator', multiple=True, required=False),
        Field(DCNS('relation.hasPart'), 'parts', WLURI,
              multiple=True, required=False),
        Field(DCNS('relation.isVariantOf'), 'variant_of', WLURI,
              required=False),

        Field(DCNS('relation.coverImage.url'), 'cover_url', required=False),
        Field(DCNS('relation.coverImage.attribution'), 'cover_by',
              required=False),
        Field(DCNS('relation.coverImage.source'), 'cover_source',
              required=False),
        # WLCover-specific.
        Field(WLNS('coverBarColor'), 'cover_bar_color', required=False),
        Field(WLNS('coverBoxPosition'), 'cover_box_position', required=False),
        Field(WLNS('coverClass'), 'cover_class', default=['default']),
        Field(WLNS('coverLogoUrl'), 'cover_logo_urls', multiple=True,
              required=False),
        Field(WLNS('endnotes'), 'endnotes', BoolValue,
              required=False),

        Field('pdf-id',  'isbn_pdf',  required=False),
        Field('epub-id', 'isbn_epub', required=False),
        Field('mobi-id', 'isbn_mobi', required=False),
        Field('txt-id',  'isbn_txt',  required=False),
        Field('html-id', 'isbn_html', required=False),

    )

    @classmethod
    def get_field_by_uri(cls, uri):
        """Return the first field in FIELDS whose uri equals ``uri``, or None."""
        for f in cls.FIELDS:
            if f.uri == uri:
                return f
        return None

    @classmethod
    def from_bytes(cls, xml, *args, **kwargs):
        """Build an instance from XML given as bytes; see :meth:`from_file`."""
        return cls.from_file(io.BytesIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build an instance from the first rdf:RDF section of ``xmlfile``.

        The input is parsed incrementally and parsing stops once the
        RDF section has been closed.  Extra arguments are passed on to
        :meth:`from_element`.

        Raises:
            NoDublinCore: if no rdf:RDF element is found.
            ParseError: if the XML cannot be parsed.
        """
        desc_tag = None
        try:
            rdf_name = RDFNS('RDF')
            events = etree.iterparse(xmlfile, ['start', 'end'])
            for event, element in events:
                if element.tag == rdf_name and event == 'start':
                    desc_tag = element
                    break

            if desc_tag is None:
                raise NoDublinCore(
                    "DublinCore section not found. "
                    "Check if there are rdf:RDF and rdf:Description tags."
                )

            # Keep consuming events until the RDF section is closed, so
            # that the element is fully populated before it is read.
            for event, element in events:
                if element.tag == rdf_name and event == 'end':
                    break

            # If the section is never closed, the parser reports a
            # syntax error, which is handled below.

            # extract data from the element and make the info
            return cls.from_element(desc_tag, *args, **kwargs)
        except (XMLSyntaxError, ExpatError) as e:
            raise ParseError(e) from e

    @classmethod
    def from_element(cls, rdf_tag, *args, **kwargs):
        """Build an instance from an already parsed RDF element.

        Collects the text of every known child of the first
        rdf:Description found under ``rdf_tag``; children with unknown
        tags are ignored.  Extra arguments are passed to the
        constructor.

        Raises:
            NoDublinCore: if there is no rdf:Description element.
        """
        # The tree is already parsed,
        # so we don't need to worry about Expat errors.
        field_dict = {}
        desc = rdf_tag.find(".//" + RDFNS('Description'))

        if desc is None:
            raise NoDublinCore(
                "There must be a '%s' element inside the RDF."
                % RDFNS('Description')
            )

        # Default language: the nearest xml:lang on the description or
        # one of its ancestors.
        lang = None
        p = desc
        while p is not None and lang is None:
            lang = p.attrib.get(XMLNS('lang'))
            p = p.getparent()

        for e in desc:
            tag = e.tag
            if tag == 'meta':
                # <meta id="...-id"> elements are looked up by their id.
                meta_id = e.attrib.get('id')
                if meta_id and meta_id.endswith('-id'):
                    tag = meta_id

            field = cls.get_field_by_uri(tag)
            if field is None:
                # Ignore unknown fields.
                continue

            fv = field_dict.get(tag, [])
            if e.text is not None:
                val = field.value_type.from_text(e.text)
                val.lang = e.attrib.get(XMLNS('lang'), lang)
            else:
                # Empty element: record None as its value.
                val = e.text
            fv.append(val)
            field_dict[tag] = fv

        return cls(desc.attrib, field_dict, *args, **kwargs)

    def __init__(self, rdf_attrs, dc_fields, fallbacks=None, strict=False, validate_required=True):
        """
        rdf_attrs should be a dictionary-like object with any attributes
        of the RDF:Description.
        dc_fields - dictionary mapping DC fields (with namespace) to
        list of text values for the given field.
        fallbacks, strict and validate_required are passed to
        Field.validate for every field in FIELDS.
        """

        self.about = rdf_attrs.get(RDFNS('about'))
        self.fmap = {}

        for field in self.FIELDS:
            value = field.validate(dc_fields, fallbacks=fallbacks,
                                   strict=strict, validate_required=validate_required)
            setattr(self, 'prop_' + field.name, value)
            # Register the field under its name and its singular alias
            # so attribute access can find it.
            self.fmap[field.name] = field
            if field.salias:
                self.fmap[field.salias] = field

    def __getattribute__(self, name):
        """Resolve field names and singular aliases to stored values.

        A field name returns the stored value; a singular alias returns
        the first item of the stored list, or None when it is empty.
        Any other name falls back to normal attribute lookup.

        Raises:
            TypeError: if ``name`` is an alias of a non-multiple field.
        """
        try:
            field = object.__getattribute__(self, 'fmap')[name]
            value = object.__getattribute__(self, 'prop_'+field.name)
            if field.name == name:
                return value
            else:  # singular alias
                if not field.multiple:
                    raise TypeError(
                        "Singular alias '%s' is defined on non-multiple "
                        "field '%s'" % (name, field.name)
                    )

                return value[0] if value else None
        except (KeyError, AttributeError):
            return object.__getattribute__(self, name)

    def __setattr__(self, name, newvalue):
        """Store field values under ``prop_<field name>``.

        Assigning to a singular alias stores a one-item list.  Any other
        name is set as an ordinary attribute.

        Raises:
            TypeError: if ``name`` is an alias of a non-multiple field.
        """
        try:
            field = object.__getattribute__(self, 'fmap')[name]
            if field.name == name:
                object.__setattr__(self, 'prop_'+field.name, newvalue)
            else:  # singular alias
                if not field.multiple:
                    raise TypeError(
                        "Singular alias '%s' is defined on non-multiple "
                        "field '%s'" % (name, field.name)
                    )

                object.__setattr__(self, 'prop_'+field.name, [newvalue])
        except (KeyError, AttributeError):
            return object.__setattr__(self, name, newvalue)

    def update(self, field_dict):
        """
        Update using field_dict, a mapping of field names to new values.
        Values are assigned as given: nothing is validated and required
        fields are not checked. Keys that are not field names are ignored.
        """
        for field in self.FIELDS:
            if field.name in field_dict:
                setattr(self, field.name, field_dict[field.name])

    def to_etree(self, parent=None):
        """XML representation of this object.

        Returns an rdf:RDF element containing one rdf:Description with
        a child element for every field value.  When ``parent`` is
        given the root is created with ``parent.makeelement``.
        """
        # etree._namespace_map[str(self.RDF)] = 'rdf'
        # etree._namespace_map[str(self.DC)] = 'dc'

        if parent is None:
            root = etree.Element(RDFNS('RDF'))
        else:
            root = parent.makeelement(RDFNS('RDF'))

        description = etree.SubElement(root, RDFNS('Description'))

        if self.about:
            description.set(RDFNS('about'), self.about)

        for field in self.FIELDS:
            v = getattr(self, field.name, None)
            if v is not None:
                if field.multiple:
                    if len(v) == 0:
                        continue
                    for x in v:
                        elem = etree.Element(field.uri)
                        # A None item becomes an empty element.
                        if x is not None:
                            elem.text = str(x)
                        description.append(elem)
                else:
                    elem = etree.Element(field.uri)
                    elem.text = str(v)
                    description.append(elem)

        return root

    def serialize(self):
        """Return a dict with the ``about`` attribute and all set fields.

        The result has the keys ``'about'`` and ``'fields'``; each field
        entry holds its uri and its stringified value(s).  Unset fields
        and empty lists are left out.
        """
        rdf = {'about': {'uri': RDFNS('about'), 'value': self.about}}

        dc = {}
        for field in self.FIELDS:
            v = getattr(self, field.name, None)
            if v is not None:
                if field.multiple:
                    if len(v) == 0:
                        continue
                    v = [str(x) for x in v if x is not None]
                else:
                    v = str(v)

                dc[field.name] = {'uri': field.uri, 'value': v}
        rdf['fields'] = dc
        return rdf

    def to_dict(self):
        """Return a flat dict of ``about`` and stringified field values.

        Fields are keyed by name; fields with a singular alias also get
        an entry under the alias when its value is not None.  Unset
        fields and empty lists are left out.
        """
        result = {'about': self.about}
        for field in self.FIELDS:
            v = getattr(self, field.name, None)

            if v is not None:
                if field.multiple:
                    if len(v) == 0:
                        continue
                    v = [str(x) for x in v if x is not None]
                else:
                    v = str(v)
                result[field.name] = v

            if field.salias:
                v = getattr(self, field.salias)
                if v is not None:
                    result[field.salias] = str(v)

        return result
377
378
def parse(file_name, cls=BookInfo):
    """Read metadata from ``file_name`` by calling ``cls.from_file``.

    ``cls`` defaults to BookInfo; whatever ``from_file`` returns is
    returned unchanged.
    """
    info = cls.from_file(file_name)
    return info