aa6f37f431748481fae2f63132fc8024503afc01
[librarian.git] / src / librarian / document.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 import gettext
5 import os
6 import re
7 import urllib.request
8 from lxml import etree
9 from .parser import parser
10 from . import dcparser, DCNS, DirDocProvider
11 from .functions import lang_code_3to2
12
13
class WLDocument:
    """A document parsed into an lxml tree, with id and metadata helpers.

    The tree is parsed with the package's own ``parser``; the root element
    it produces is expected to expose ``validate()``, ``meta`` and
    ``master`` (these come from the element classes configured elsewhere
    in the package, not from this class).
    """

    # A slash followed by one whitespace character. Every occurrence found
    # in the text of a stanza consumes one ordinal in
    # _compat_assign_ordered_ids (presumably these are verse separators --
    # confirm against the markup definition).
    _SLASH_BREAK = re.compile(r'/\s', re.MULTILINE | re.UNICODE)

    def __init__(self, filename=None, url=None, provider=None):
        """Parse and validate a document.

        Args:
            filename: Path or open file object to parse. Takes precedence
                over ``url`` when truthy.
            url: Location to download the document from when ``filename``
                is not given.
            provider: Object used to look up child documents by slug;
                defaults to ``DirDocProvider('.')``.

        Raises:
            ValueError: If neither ``filename`` nor ``url`` is given.
        """
        if filename:
            tree = etree.parse(filename, parser=parser)
        else:
            if url is None:
                raise ValueError(
                    'WLDocument needs either a filename or a url.')
            # Close the HTTP response as soon as the parser has consumed it.
            with urllib.request.urlopen(url) as response:
                tree = etree.parse(response, parser=parser)

        self.tree = tree
        # Back-reference so that elements can reach their owning document.
        tree.getroot().document = self
        self.base_meta = dcparser.BookInfo({}, {
            DCNS('language'): ["pol"],
        }, validate_required=False)

        self.provider = (
            provider if provider is not None else DirDocProvider('.')
        )

        self.tree.getroot().validate()

    @property
    def meta(self):
        """Metadata object exposed by the root element of the tree."""
        return self.tree.getroot().meta

    @property
    def children(self):
        """Yield a document of the same class for every part in the metadata.

        Each part is opened through ``self.provider.by_slug`` and parsed
        while the provider's context manager is still active. Nothing is
        yielded when ``meta.parts`` is empty or ``None``.
        """
        for part_uri in self.meta.parts or []:
            with self.provider.by_slug(part_uri.slug) as f:
                yield type(self)(filename=f, provider=self.provider)

    def build(self, builder, base_url=None, **kwargs):
        """Instantiate ``builder`` with ``base_url`` and build this document.

        Extra keyword arguments are forwarded to the builder's ``build``
        method, whose result is returned unchanged.
        """
        return builder(base_url=base_url).build(self, **kwargs)

    @staticmethod
    def _breadth_first(root):
        """Yield ``root`` and its descendants level by level.

        The children of an element are collected only after the consumer
        has finished handling that element (i.e. when the generator is
        resumed), so changes the consumer makes to the element's children
        are taken into account.
        """
        level = [root]
        while level:
            upcoming = []
            for item in level:
                yield item
                upcoming.extend(item)
            level = upcoming

    def assign_ids(self, existing=None):
        """Give an ``id`` attribute to every element that should have one.

        Elements are visited breadth-first. An element receives an id when
        it has no (non-empty) ``id`` yet and its ``SHOULD_HAVE_ID``
        attribute is truthy. New ids have the form ``e<N>`` and never
        collide with an id already present in the tree or in ``existing``.

        Args:
            existing: Optional set of ids that are already taken. The ids
                found in the tree are added to it in place; a fresh set is
                used when it is ``None``.
        """
        if existing is None:
            existing = set()
        root = self.tree.getroot()

        # First pass: normalize the elements and collect the ids in use.
        for item in self._breadth_first(root):
            try:
                item.normalize_insides()
            except AttributeError:
                # Not every element provides normalize_insides().
                pass
            current_id = item.attrib.get('id')
            if current_id is not None:
                existing.add(current_id)

        # Second pass: hand out the free e<N> ids in breadth-first order.
        counter = 1
        for item in self._breadth_first(root):
            if item.attrib.get('id'):
                continue
            if not getattr(item, 'SHOULD_HAVE_ID', False):
                continue
            while f'e{counter}' in existing:
                counter += 1
            item.attrib['id'] = f'e{counter}'
            counter += 1

    def _compat_assign_ordered_ids(self):
        """
        Compatibility: ids in document order, to be roughly compatible with
        legacy footnote ids. Just for testing consistency, change to some
        sane identifiers at convenience.

        Every visited element gets a ``_compat_ordered_id`` attribute with
        consecutive numbers starting at 4 for the root. Inside an element
        whose ``HTML_CLASS`` is ``'stanza'`` each slash-plus-whitespace
        found in the text between children additionally consumes one
        number. Children of ``uwaga`` and ``extra`` elements are skipped.
        """
        slash_break = self._SLASH_BREAK

        def count_breaks(text):
            # Number of slash-plus-whitespace occurrences; 0 for no text.
            return len(slash_break.findall(text)) if text else 0

        def number_element(elem, i):
            # Label elem with i, recurse, and return the next free number.
            elem.attrib['_compat_ordered_id'] = str(i)
            i += 1
            if getattr(elem, 'HTML_CLASS', None) == 'stanza':
                i += count_breaks(elem.text)
                for sub in elem:
                    i = number_element(sub, i)
                    i += count_breaks(sub.tail)
                return i
            if elem.tag in ('uwaga', 'extra'):
                return i
            for sub in elem:
                i = number_element(sub, i)
            return i

        number_element(self.tree.getroot(), 4)

    def _compat_assign_section_ids(self):
        """
        Ids in master-section order. These need to be compatible with the
        #secN anchors used by WL search results page to link to fragments.

        Children of the root's ``master`` element get ``sec1``, ``sec2``,
        ...; nested children get ``sec1-1``, ``sec1-2`` and so on, stored
        in the ``_compat_section_id`` attribute.
        """
        def label_children(elem, prefix='sec'):
            for position, child in enumerate(elem, start=1):
                idfier = f'{prefix}{position}'
                try:
                    child.attrib['_compat_section_id'] = idfier
                except Exception:
                    # Best effort: a node that cannot carry attributes is
                    # left unlabeled, but its position is still counted.
                    pass
                label_children(child, idfier + '-')

        label_children(self.tree.getroot().master)

    def editors(self):
        """Return the set of editors of this document and all its children.

        Combines ``meta.editors`` with ``meta.technical_editors``, recurses
        into ``self.children`` and drops ``None`` entries.
        """
        persons = set(self.meta.editors + self.meta.technical_editors)
        for child in self.children:
            persons.update(child.editors())
        persons.discard(None)
        return persons

    def references(self):
        """Return every ``ref`` element found anywhere below the root."""
        return self.tree.findall('.//ref')