1f22929a9e17ed61a74673d554e8070f919100ee
[librarian.git] / src / librarian / elements / base.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 import copy
5 import re
6 from lxml import etree
7 from librarian import dcparser, RDFNS
8 from librarian.util import get_translation
9
def last_words(text, n):
    """Cut the trailing words of ``text`` that contain ``n`` long words.

    Words are taken from the end of ``text``.  Only words longer than two
    characters count towards ``n``; shorter ones are carried along for free.

    Returns a ``(remaining, fragment)`` pair.  When the quota is filled,
    ``remaining`` is 0 and ``fragment`` is the collected words joined with
    single spaces.  Otherwise ``remaining`` is what is still owed and
    ``fragment`` is ``text`` itself, untouched.
    """
    remaining = n
    collected = []
    for word in text.split()[::-1]:
        collected.append(word)
        if len(word) <= 2:
            # Short words do not use up the quota.
            continue
        remaining -= 1
        if not remaining:
            break
    if remaining:
        return remaining, text
    collected.reverse()
    return remaining, ' '.join(collected)
21
22
23 class WLElement(etree.ElementBase):
    # When truthy, epub_build() registers a TOC entry for the element (passing
    # this value along) and validate() restricts which parents it may have.
    SECTION_PRECEDENCE = None
    # When true, snip() skips the element when visiting it as a child and
    # does not climb from it into its parent.
    ASIDE = False

    # Plain-text output: margins and literal prefix/suffix that txt_build()
    # pushes to the builder around the element's content.
    TXT_TOP_MARGIN = 0
    TXT_BOTTOM_MARGIN = 0
    TXT_PREFIX = ""
    TXT_SUFFIX = ""

    # HTML output: html_build() wraps the content in HTML_TAG when it is set;
    # get_html_attr() starts from a copy of HTML_ATTR and adds HTML_CLASS as
    # the 'class' attribute.
    HTML_TAG = None
    HTML_ATTR = {}
    HTML_CLASS = None

    # EPUB output: same roles as the HTML_* settings, used by epub_build()
    # and get_epub_attr().
    EPUB_TAG = None
    EPUB_ATTR = {}
    EPUB_CLASS = None
    # When true and the parent is a Master, epub_build() asks the builder to
    # start a new chunk before emitting this element.
    EPUB_START_CHUNK = False

    # build_inner() pushes the element's text and its children's tails only
    # when CAN_HAVE_TEXT is true; with STRIP it also trims the leading
    # whitespace of the first text and the trailing whitespace of the last.
    CAN_HAVE_TEXT = True
    STRIP = False
    # Starting value for the `numbering` property; None means no numbering.
    NUMBERING = None

    # (old, new) pairs applied in this order by normalize_text() using
    # str.replace; '---' has to come before '--' so that the longer run is
    # converted first.
    text_substitutions = [
        ('---', '—'),
        ('--', '–'),
        ('...', '…'),
        (',,', '„'),
        ('"', '”'),
        ('\ufeff', ''),
        ("'", "\u2019"),
    ]
54
55     @property
56     def meta_object(self):
57         if not hasattr(self, '_meta_object'):
58             elem = self.find(RDFNS('RDF'))
59             if elem is not None:
60                 self._meta_object = dcparser.BookInfo.from_element(elem)
61             else:
62                 self._meta_object = None
63         return self._meta_object
64
65     @property
66     def meta(self):
67         if self.meta_object is not None:
68             return self.meta_object
69         else:
70             if self.getparent() is not None:
71                 return self.getparent().meta
72             else:
73                 return self.document.base_meta
74
75     @property
76     def gettext(self):
77         return get_translation(self.meta.language).gettext
78
79     def in_context_of(self, setting):
80         parent = self.getparent()
81         if parent is None:
82             return False
83         try:
84             return getattr(parent, setting)
85         except AttributeError:
86             return parent.in_context_of(setting)
87
88     def get_context_map(self, setting, key, default=None):
89         parent = self.getparent()
90         if parent is None:
91             return default
92         try:
93             return getattr(parent, setting)[key]
94         except AttributeError:
95             return parent.get_context_map(setting, key, default)
96
97     def signal(self, signal):
98         parent = self.getparent()
99         if parent is not None:
100             parent.signal(signal)
101     
102     def raw_printable_text(self, builder):
103         from librarian.html import raw_printable_text
104
105         # TODO: podtagi, wyroznienia, etc
106         t = ''
107         t += self.normalize_text(self.text, builder)
108         for c in self:
109             if not isinstance(c, WLElement):
110                 continue
111             if c.tag not in ('pe', 'pa', 'pt', 'pr', 'motyw'):
112                 t += c.raw_printable_text(builder)
113             t += self.normalize_text(c.tail, builder)
114         return t
115     
116     def normalize_text(self, text, builder):
117         text = text or ''
118         for e, s in self.text_substitutions:
119             text = text.replace(e, s)
120
121         if getattr(builder, 'normalize_whitespace', False):
122             text = re.sub(r'\s+', ' ', text)
123
124         if getattr(builder, 'hyphenator', None) is not None:
125             newt = ''
126             wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(text)
127             for w in wlist:
128                 newt += builder.hyphenator.inserted(w, '\u00AD')
129             text = newt
130
131         if builder.orphans:
132             text = re.sub(r'(?<=\s\w)\s+', '\u00A0', text)
133
134         return text
135
136     def build_inner(self, builder):
137         build_method = builder.build_method_fn
138         child_count = len(self)
139         if self.CAN_HAVE_TEXT and self.text:
140             text = self.normalize_text(self.text, builder)
141             if self.STRIP:
142                 text = text.lstrip()
143                 if not child_count:
144                     text = text.rstrip()
145             builder.push_text(text)
146         for i, child in enumerate(self):
147             real_child_count = 0
148             if isinstance(child, WLElement):
149                 getattr(child, build_method)(builder)
150                 self.after_child(builder, real_child_count)
151                 real_child_count += 1
152
153             # FIXME base builder api
154             elif getattr(builder, 'debug', False) and child.tag is etree.Comment:
155                 builder.process_comment(child)
156             if self.CAN_HAVE_TEXT and child.tail:
157                 text = self.normalize_text(child.tail, builder)
158                 if self.STRIP and i == child_count - 1:
159                     text = text.rstrip()
160                 builder.push_text(text)
161
162     def after_child(self, builder, child_count):
163         fn = getattr(builder, 'after_child_fn', None)
164         if fn:
165             getattr(self, builder.after_child_fn)(builder, child_count)
166
167     def txt_after_child(self, builder, child_count):
168         pass
169
170     def txt_build_inner(self, builder):
171         self.build_inner(builder)
172
173     def txt_build(self, builder):
174         builder.push_margin(self.TXT_TOP_MARGIN)
175         builder.push_text(self.TXT_PREFIX, True)
176         self.txt_build_inner(builder)
177         builder.push_text(self.TXT_SUFFIX, True)
178         builder.push_margin(self.TXT_BOTTOM_MARGIN)
179
180     def html_build_inner(self, builder):
181         self.build_inner(builder)
182
183     def get_html_attr(self, builder):
184         attr = self.HTML_ATTR.copy()
185         if self.HTML_CLASS:
186             attr['class'] = self.HTML_CLASS
187         if builder.with_ids:
188             # always copy the id attribute (?)
189             if self.attrib.get('id'):
190                 attr['id'] = self.attrib['id']
191             if self.attrib.get('_id'):
192                 attr['id'] = self.attrib['_id']
193         return attr
194
195     def html_build(self, builder):
196         # Do we need a number?
197         numbering = self.numbering
198         if numbering == 'main':
199             if builder.with_numbering and self.has_visible_numbering:
200                 builder.add_visible_number(self)
201
202         if self.HTML_TAG:
203             builder.start_element(
204                 self.HTML_TAG,
205                 self.get_html_attr(builder),
206             )
207
208         self.html_build_inner(builder)
209         if self.HTML_TAG:
210             builder.end_element()
211
212     def epub_build_inner(self, builder):
213         self.build_inner(builder)
214
215     def get_epub_attr(self, builder):
216         attr = self.EPUB_ATTR.copy()
217         if self.EPUB_CLASS:
218             attr['class'] = self.EPUB_CLASS
219         return attr
220
    def epub_build(self, builder):
        """Emit the element into the EPUB being built by ``builder``.

        In order: optionally starts a new chunk, optionally registers a
        table-of-contents entry, opens ``EPUB_TAG`` (when set), builds the
        content through ``epub_build_inner`` and closes the tag again.
        The element's ``id`` attribute is overwritten when a TOC fragment
        identifier is generated for it.
        """
        # Imported here rather than at module level.
        # NOTE(review): presumably to avoid a circular import - confirm.
        from librarian.elements.masters import Master

        # TEMPORARY
        # These assignments set instance attributes, overriding the
        # class-level CAN_HAVE_TEXT / STRIP for this element.
        self.CAN_HAVE_TEXT = True
        self.STRIP = False

        # A new chunk is started only for chunk-starting elements that sit
        # directly inside a Master.
        start_chunk = self.EPUB_START_CHUNK and isinstance(self.getparent(), Master)

        if start_chunk:
            builder.start_chunk()

        fragment = None
        if self.SECTION_PRECEDENCE and not self.in_context_of('NO_TOC'):
            if not start_chunk:
                # Not at the start of a chunk: generate an id for the element
                # and pass it to the TOC entry.
                fragment = 'sub%d' % builder.assign_section_number()
                self.attrib['id'] = fragment

            # When a chunk was started, `fragment` is still None here.
            builder.add_toc_entry(
                fragment,
                self.raw_printable_text(builder),
                self.SECTION_PRECEDENCE
            )

        if self.EPUB_TAG:
            attr = self.get_epub_attr(builder)
            if fragment:
                attr['id'] = fragment
            if builder.debug:
                # Debug marker '<chunk number>:<line>'.
                # NOTE(review): builder.splits[-1] is presumably the source
                # line at which the current chunk began - confirm.
                chunkno, sourceline = 0, self.sourceline
                if builder.splits:
                    chunkno, sourceline = len(builder.splits), sourceline - builder.splits[-1]
                attr['data-debug'] = f'{chunkno}:{sourceline}'
            builder.start_element(
                self.EPUB_TAG,
                attr
            )

        self.epub_build_inner(builder)
        if self.EPUB_TAG:
            builder.end_element()
262
263     def validate(self):
264         from librarian.elements.masters import Master
265         from librarian.elements.blocks import DlugiCytat, PoezjaCyt
266         from librarian.elements.footnotes import Footnote
267
268         if self.SECTION_PRECEDENCE:
269             assert isinstance(self.getparent(), (Master, DlugiCytat, PoezjaCyt, Footnote)), \
270                     'Header {} inside a <{}> instead of a master.'.format(
271                             etree.tostring(self, encoding='unicode'), self.getparent().tag)
272
273         for c in self:
274             if isinstance(c, WLElement):
275                 c.validate()
276
277
278     def sanitize(self):
279         # TODO: Remove insanity here.
280         for e in self:
281             if isinstance(e, WLElement):
282                 e.sanitize()
283
284     def snip(self, words, before=None, sub=False):
285         if sub and self.ASIDE:
286             return words, []
287
288         snippet = []
289         if before is not None:
290             i = self.index(before)
291         else:
292             i = len(self)
293
294         while i > 0:
295             i -= 1
296             if self[i].tail:
297                 if words:
298                     words, text = last_words(self[i].tail, words)
299                     snippet = [('text', text)] + snippet
300
301             if words:
302                 words, subsnip = self[i].snip(words, sub=True)
303                 snippet = subsnip + snippet
304
305         if words and self.text:
306             words, text = last_words(self.text, words)
307             snippet = [('text', text)] + snippet
308                     
309         snippet = [('start', self.tag, self.attrib)] + snippet + [('end',)]
310
311         if not sub and words and not self.ASIDE:
312             # do we dare go up?
313             parent = self.getparent()
314             if parent is not None and parent.CAN_HAVE_TEXT:
315                 words, parsnip = parent.snip(words, before=self)
316                 return words, parsnip[:-1] + snippet + parsnip[-1:]
317
318         return words, snippet
319
320     def get_snippet(self, words=15):
321         from librarian.parser import parser
322
323         words, snippet = self.getparent().snip(words=words, before=self)
324         
325         cursor = snipelem = parser.makeelement('snippet')
326         snipelem._meta_object = self.meta
327         for s in snippet:
328             if s[0] == 'start':
329                 elem = parser.makeelement(s[1], **s[2])
330                 cursor.append(elem)
331                 cursor = elem
332             elif s[0] == 'end':
333                 cursor = cursor.getparent()
334             else:
335                 if len(cursor):
336                     cursor[-1].tail = (cursor[-1].tail or '') + s[1]
337                 else:
338                     cursor.text = (cursor.text or '') + s[1]
339
340         return snipelem
341
342     @property
343     def numbering(self):
344         numbering = self.NUMBERING
345         if numbering is None or self.in_context_of('DISABLE_NUMBERING'):
346             return None
347         numbering = self.get_context_map('SUPPRESS_NUMBERING', numbering, numbering)
348         return numbering
349
350     @property
351     def id_prefix(self):
352         prefix = self.numbering
353         if prefix == 'main':
354             # TODO: self.context.main_numbering_prefix
355             prefix = 'f' # default numbering prefix
356         return prefix
357
358     def assign_id(self, document):
359         numbering = self.numbering
360         if numbering:
361             number = str(document.counters[numbering])
362             self.attrib['_id'] = self.id_prefix + number
363             document.counters[numbering] += 1
364
365             if numbering == 'main':
366                 self.attrib['_visible_numbering'] = str(document.counters['_visible'])
367                 document.counters['_visible'] += 1
368
369             if numbering == 'fn':
370                 self.attrib['_visible_numbering'] = number
371
372     def get_link(self):
373         return self.attrib.get('_id') or self.getparent().get_link()
374
375
class Snippet(WLElement):
    """A ``WLElement`` that adds no behaviour of its own.

    NOTE(review): presumably the class behind the ``snippet`` element that
    ``WLElement.get_snippet`` creates - confirm in the tag-to-class mapping.
    """