fix for bad documents.
[redakcja.git] / src / sources / document.py
1 import os
2 from librarian import RDFNS, DCNS
3 from lxml import etree
4 from datetime import date
5 from . import ocr
6 from django.conf import settings
7
8
def build_document_texts(book_source):
    """Return one serialized XML document per entry in ``text_builders``.

    Every document has an ``utwor`` root that holds the RDF metadata
    written by ``add_rdf`` followed by a ``powiesc`` element.  The builder
    is called once for each page yielded by ``book_source.get_ocr_files()``
    and receives the ``powiesc`` element together with that page.

    The result is a list of pretty-printed unicode strings, in the same
    order as ``text_builders``.
    """

    def render(builder):
        # Each variant gets its own, freshly built tree.
        document = etree.Element('utwor')
        add_rdf(document, book_source)

        # The builders append page content under this element.
        body = etree.SubElement(document, 'powiesc')
        for ocr_page in book_source.get_ocr_files():
            builder(body, ocr_page)

        return etree.tostring(document, encoding='unicode', pretty_print=True)

    return [render(builder) for builder in text_builders]
24
25
# Page builders used by build_document_texts(): one output document is
# produced per entry, in this order.  Each entry is called as
# builder(master, page).
text_builders = [
    ocr.add_page_to_master,
    ocr.add_page_to_master_as_stanzas,
    ocr.add_page_to_master_as_p,
]
31
32
def add_rdf(root, book_source):
    """Append an RDF/Dublin Core metadata header for the book to ``root``.

    An ``RDF`` element containing a single ``Description`` is added as a
    child of ``root``; the description is filled with Dublin Core entries
    taken from ``book_source.book`` and ``book_source.source``.

    Args:
        root: lxml element that receives the ``RDF`` subtree.
        book_source: object exposing ``book`` (authors, translators, title,
            pd_year, language, slug, epochs, kinds, genres) and
            ``source.name``.

    Returns:
        None; ``root`` is modified in place.
    """
    book = book_source.book

    # TODO: to librarian
    rdf = etree.SubElement(root, RDFNS('RDF'))
    desc = etree.SubElement(rdf, RDFNS('Description'))

    def add_dc(name, text):
        # Create one Dublin Core child of the description and set its text.
        etree.SubElement(desc, DCNS(name)).text = text

    # author
    for author in book.authors.all():
        add_dc('creator', f'{author.last_name_pl}, {author.first_name_pl}')
    # translator
    for tr in book.translators.all():
        add_dc('contributor.translator', f'{tr.last_name_pl}, {tr.first_name_pl}')
    # title
    add_dc('title', book.title)
    # created_at
    add_dc('date', date.today().isoformat())
    # date.pd
    # lxml rejects non-string text with a TypeError, so a numeric year has
    # to be converted first (presumably pd_year is an integer field --
    # confirm against the model).  None is kept and yields an empty element.
    pd_year = book.pd_year
    if pd_year is not None and not isinstance(pd_year, (str, bytes)):
        pd_year = str(pd_year)
    add_dc('date.pd', pd_year)
    # publisher
    add_dc('publisher', 'Fundacja Wolne Lektury')
    # language
    add_dc('language', book.language)  # 3to2?
    # description: not emitted yet
    # source_name
    add_dc('source', book_source.source.name)
    # url
    add_dc('identifier.url', f'https://wolnelektury.pl/katalog/lektura/{book.slug}/')
    # license? / license_description?: only an empty rights entry for now
    add_dc('rights', '')
    # epochs
    for tag in book.epochs.all():
        add_dc('subject.period', tag.name)
    # kinds
    for tag in book.kinds.all():
        add_dc('subject.type', tag.name)
    # genres
    for tag in book.genres.all():
        add_dc('subject.genre', tag.name)
73
74