Counting belongs in document.
[librarian.git] / src / librarian / elements / base.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 import copy
5 import re
6 from lxml import etree
7 from librarian import dcparser, RDFNS
8 from librarian.util import get_translation
9
def last_words(text, n):
    """Cut *text* down to its trailing part holding *n* "long" words.

    The text is split on whitespace and scanned from the end.  Every word
    is collected, but only words longer than two characters are counted
    against *n*; shorter words ride along for free.

    Returns a ``(remaining, text)`` tuple:

    * if the text held fewer than *n* long words, ``remaining`` is the
      number still missing and the original text is returned untouched
      (original whitespace included);
    * otherwise ``remaining`` is 0 and the returned text is the collected
      trailing words joined with single spaces.
    """
    tail = []
    for token in text.split()[::-1]:
        tail.append(token)
        if len(token) <= 2:
            # Short words are kept but do not use up the budget.
            continue
        n -= 1
        if n == 0:
            break

    if n != 0:
        # Budget not exhausted: the whole text is needed.
        return n, text

    # Words were gathered back-to-front; restore reading order.
    tail.reverse()
    return n, ' '.join(tail)
21
22
class WLElement(etree.ElementBase):
    """Base class for document elements, built on lxml's ``ElementBase``.

    It bundles what every element shares: metadata lookup, settings
    inherited from ancestors, text normalization, the common child
    traversal behind the txt / HTML / EPUB build methods, snippet
    extraction and id assignment.  Subclasses are expected to tune the
    behavior by overriding the class-level constants below.
    """

    # Truthy for headers: validate() then requires a master-like parent
    # and epub_build() registers a TOC entry with this value as the level.
    SECTION_PRECEDENCE = None
    # Aside elements are skipped when a snippet descends into children,
    # and a snippet never climbs up out of one (see snip()).
    ASIDE = False

    # Margins and literal prefix/suffix emitted around the element's
    # content by txt_build().
    TXT_TOP_MARGIN = 0
    TXT_BOTTOM_MARGIN = 0
    TXT_PREFIX = ""
    TXT_SUFFIX = ""

    # Wrapper element emitted by html_build(); no wrapper when HTML_TAG
    # is falsy.  HTML_ATTR is copied before use, never mutated.
    HTML_TAG = None
    HTML_ATTR = {}
    HTML_CLASS = None

    # Same as above, for epub_build().
    EPUB_TAG = None
    EPUB_ATTR = {}
    EPUB_CLASS = None
    # When true and the parent is a Master, epub_build() starts a new
    # chunk before emitting this element.
    EPUB_START_CHUNK = False

    # Whether the element's own text and its children's tails are emitted.
    CAN_HAVE_TEXT = True
    # Whether leading/trailing whitespace of the content is stripped.
    STRIP = False
    # Name of the numbering scheme (also the counter key used by
    # assign_id()); None means the element is not numbered.
    NUMBERING = None

    # (source, replacement) pairs applied in order by normalize_text().
    # Order matters: '---' must be handled before '--'.
    text_substitutions = [
        ('---', '—'),
        ('--', '–'),
        ('...', '…'),
        (',,', '„'),
        ('"', '”'),
        ('\ufeff', ''),
        ("'", "\u2019"),
    ]

    @property
    def meta_object(self):
        """Metadata parsed from this element's own RDF child, or None.

        The result is cached on the instance in ``_meta_object`` after
        the first access.
        """
        if not hasattr(self, '_meta_object'):
            elem = self.find(RDFNS('RDF'))
            if elem is not None:
                self._meta_object = dcparser.BookInfo.from_element(elem)
            else:
                self._meta_object = None
        return self._meta_object

    @property
    def meta(self):
        """Effective metadata: own, else the nearest ancestor's.

        Falls back to ``self.document.base_meta`` at the root.
        NOTE(review): ``document`` is not set anywhere in this class;
        presumably it is attached by the owning document -- confirm.
        """
        if self.meta_object is not None:
            return self.meta_object
        parent = self.getparent()
        if parent is not None:
            return parent.meta
        return self.document.base_meta

    @property
    def gettext(self):
        """The ``gettext`` callable for the language given in ``meta``."""
        return get_translation(self.meta.language).gettext

    def in_context_of(self, setting):
        """Return attribute *setting* of the nearest ancestor defining it.

        The search starts at the parent, not at this element.  Returns
        False when no ancestor has the attribute.
        """
        parent = self.getparent()
        if parent is None:
            return False
        try:
            return getattr(parent, setting)
        except AttributeError:
            return parent.in_context_of(setting)

    def get_context_map(self, setting, key, default=None):
        """Look up *key* in mapping *setting* of the nearest ancestor.

        Ancestors lacking the attribute are skipped; *default* is
        returned when none defines it.
        NOTE(review): if the nearest ancestor defines the mapping but
        it lacks *key*, the KeyError propagates -- confirm that every
        such mapping is complete, or that this is intended.
        """
        parent = self.getparent()
        if parent is None:
            return default
        try:
            return getattr(parent, setting)[key]
        except AttributeError:
            return parent.get_context_map(setting, key, default)

    def signal(self, signal):
        """Pass *signal* up to the parent; a no-op at the root."""
        parent = self.getparent()
        if parent is not None:
            parent.signal(signal)

    def raw_printable_text(self, builder):
        """Return the normalized plain text of this element's subtree.

        Children tagged 'pe', 'pa', 'pt', 'pr' or 'motyw' contribute
        only their tail, not their content.  Children that are not
        WLElement instances are skipped together with their tail.
        """
        # TODO: subtags, emphasis, etc.
        parts = [self.normalize_text(self.text, builder)]
        for c in self:
            if not isinstance(c, WLElement):
                continue
            if c.tag not in ('pe', 'pa', 'pt', 'pr', 'motyw'):
                parts.append(c.raw_printable_text(builder))
            parts.append(self.normalize_text(c.tail, builder))
        return ''.join(parts)

    def normalize_text(self, text, builder):
        """Apply typographic normalization to *text* (None means '').

        Steps, in order: the ``text_substitutions`` replacements;
        whitespace collapsing if ``builder.normalize_whitespace``;
        soft-hyphen insertion if ``builder.hyphenator`` is set;
        no-break spaces after one-letter words if ``builder.orphans``.
        All four builder attributes are optional.
        """
        text = text or ''
        for e, s in self.text_substitutions:
            text = text.replace(e, s)

        if getattr(builder, 'normalize_whitespace', False):
            text = re.sub(r'\s+', ' ', text)

        hyphenator = getattr(builder, 'hyphenator', None)
        if hyphenator is not None:
            # Split into word runs and single non-word characters so that
            # every character passes through, then hyphenate each piece.
            text = ''.join(
                hyphenator.inserted(w, '\u00AD')
                for w in re.findall(r'\w+|[^\w]', text, re.UNICODE)
            )

        # Read like the other optional builder settings, so a builder
        # without an `orphans` attribute is not an error.
        if getattr(builder, 'orphans', False):
            # Whitespace following a single-character word becomes a
            # no-break space.
            text = re.sub(r'(?<=\s\w)\s+', '\u00A0', text)

        return text

    def _build_inner(self, builder, build_method):
        """Emit this element's text and children through *builder*.

        *build_method* is the name of the method called on each
        WLElement child (e.g. 'txt_build').  After each such child,
        after_child() is invoked with the number of WLElement children
        built before it.  Text and tails are pushed only when
        CAN_HAVE_TEXT; with STRIP, the outer whitespace is removed.
        """
        child_count = len(self)
        if self.CAN_HAVE_TEXT and self.text:
            text = self.normalize_text(self.text, builder)
            if self.STRIP:
                text = text.lstrip()
                if not child_count:
                    text = text.rstrip()
            builder.push_text(text)

        # Counts WLElement children only; must live outside the loop so
        # that it actually advances from one child to the next.
        real_child_count = 0
        for i, child in enumerate(self):
            if isinstance(child, WLElement):
                getattr(child, build_method)(builder)
                self.after_child(builder, real_child_count)
                real_child_count += 1

            # FIXME base builder api
            elif getattr(builder, 'debug', False) and child.tag is etree.Comment:
                builder.process_comment(child)
            if self.CAN_HAVE_TEXT and child.tail:
                text = self.normalize_text(child.tail, builder)
                if self.STRIP and i == child_count - 1:
                    text = text.rstrip()
                builder.push_text(text)

    def after_child(self, builder, child_count):
        """Run the builder-specific after-child hook, if one is named.

        ``builder.after_child_fn`` (optional) holds the name of a method
        on this element, called as ``method(builder, child_count)``.
        """
        fn = getattr(builder, 'after_child_fn', None)
        if fn:
            getattr(self, fn)(builder, child_count)

    def txt_after_child(self, builder, child_count):
        """After-child hook for text output; does nothing by default."""
        pass

    def _txt_build_inner(self, builder):
        """Build the content for text output."""
        self._build_inner(builder, 'txt_build')

    def txt_build(self, builder):
        """Emit the element as text: margin, prefix, content, suffix, margin."""
        builder.push_margin(self.TXT_TOP_MARGIN)
        builder.push_text(self.TXT_PREFIX, True)
        self._txt_build_inner(builder)
        builder.push_text(self.TXT_SUFFIX, True)
        builder.push_margin(self.TXT_BOTTOM_MARGIN)

    def _html_build_inner(self, builder):
        """Build the content for HTML output."""
        self._build_inner(builder, 'html_build')

    def get_html_attr(self, builder):
        """Return a fresh attribute dict for the HTML wrapper element.

        Starts from a copy of HTML_ATTR, adds HTML_CLASS as 'class', and,
        when ``builder.with_ids``, an 'id' taken from the element's 'id'
        attribute, overridden by '_id' when that one is present.
        """
        attr = self.HTML_ATTR.copy()
        if self.HTML_CLASS:
            attr['class'] = self.HTML_CLASS
        if builder.with_ids:
            # always copy the id attribute (?)
            if self.attrib.get('id'):
                attr['id'] = self.attrib['id']
            if self.attrib.get('_id'):
                attr['id'] = self.attrib['_id']
        return attr

    def html_build(self, builder):
        """Emit the element as HTML, wrapped in HTML_TAG if it is set.

        For 'main' numbering a visible number is added first, provided
        ``builder.with_numbering`` and ``self.has_visible_numbering``.
        NOTE(review): ``has_visible_numbering`` is not defined in this
        class; presumably subclasses provide it -- confirm.
        """
        # Do we need a number?
        numbering = self.numbering
        if numbering == 'main':
            if builder.with_numbering and self.has_visible_numbering:
                builder.add_visible_number(self)

        if self.HTML_TAG:
            builder.start_element(
                self.HTML_TAG,
                self.get_html_attr(builder),
            )

        self._html_build_inner(builder)
        if self.HTML_TAG:
            builder.end_element()

    def _epub_build_inner(self, builder):
        """Build the content for EPUB output."""
        self._build_inner(builder, 'epub_build')

    def get_epub_attr(self, builder):
        """Return a fresh attribute dict for the EPUB wrapper element."""
        attr = self.EPUB_ATTR.copy()
        if self.EPUB_CLASS:
            attr['class'] = self.EPUB_CLASS
        return attr

    def epub_build(self, builder):
        """Emit the element as EPUB content.

        Starts a new chunk when EPUB_START_CHUNK is set and the parent
        is a Master.  Headers (truthy SECTION_PRECEDENCE) outside a
        'NO_TOC' context get a TOC entry; those that do not start a
        chunk also get a 'subN' fragment id.  In builder debug mode the
        wrapper carries a 'data-debug' attribute of the form
        'chunk:line'.
        """
        from librarian.elements.masters import Master

        # TEMPORARY
        self.CAN_HAVE_TEXT = True
        self.STRIP = False

        start_chunk = self.EPUB_START_CHUNK and isinstance(self.getparent(), Master)

        if start_chunk:
            builder.start_chunk()

        fragment = None
        if self.SECTION_PRECEDENCE and not self.in_context_of('NO_TOC'):
            if not start_chunk:
                fragment = 'sub%d' % builder.assign_section_number()
                self.attrib['id'] = fragment

            builder.add_toc_entry(
                fragment,
                self.raw_printable_text(builder),
                self.SECTION_PRECEDENCE
            )

        if self.EPUB_TAG:
            attr = self.get_epub_attr(builder)
            if fragment:
                attr['id'] = fragment
            # `debug` is optional on builders, as in _build_inner().
            if getattr(builder, 'debug', False):
                chunkno, sourceline = 0, self.sourceline
                if builder.splits:
                    # Line number relative to the last recorded split.
                    chunkno, sourceline = len(builder.splits), sourceline - builder.splits[-1]
                attr['data-debug'] = f'{chunkno}:{sourceline}'
            builder.start_element(
                self.EPUB_TAG,
                attr
            )

        self._epub_build_inner(builder)
        if self.EPUB_TAG:
            builder.end_element()

    def validate(self):
        """Recursively check structural constraints of the subtree.

        Raises AssertionError when a header (truthy SECTION_PRECEDENCE)
        is not a direct child of a Master, DlugiCytat, PoezjaCyt or
        Footnote -- including when it has no parent at all.
        """
        from librarian.elements.masters import Master
        from librarian.elements.blocks import DlugiCytat, PoezjaCyt
        from librarian.elements.footnotes import Footnote

        if self.SECTION_PRECEDENCE:
            parent = self.getparent()
            # getattr keeps the message buildable for a parentless header,
            # so the failure is still reported as an AssertionError.
            assert isinstance(parent, (Master, DlugiCytat, PoezjaCyt, Footnote)), \
                'Header {} inside a <{}> instead of a master.'.format(
                    etree.tostring(self, encoding='unicode'),
                    getattr(parent, 'tag', None))

        for c in self:
            if isinstance(c, WLElement):
                c.validate()

    def sanitize(self):
        """Recursively sanitize WLElement children; a no-op at this level."""
        # TODO: Remove insanity here.
        for e in self:
            if isinstance(e, WLElement):
                e.sanitize()

    def snip(self, words, before=None, sub=False):
        """Collect up to *words* long words of text preceding a point.

        Scans backwards through the children located before *before*
        (all children when it is None), then this element's own text,
        taking text via last_words().  Returns ``(remaining, events)``
        where events is a list of ``('start', tag, attrib)``,
        ``('text', str)`` and ``('end',)`` tuples in document order.

        With ``sub=True`` (descending into a child) an ASIDE element
        contributes nothing.  With ``sub=False``, if words are still
        missing, the search continues in the parent and this element's
        events are nested inside the parent's.
        """
        if sub and self.ASIDE:
            return words, []

        snippet = []
        if before is not None:
            i = self.index(before)
        else:
            i = len(self)

        while i > 0:
            i -= 1
            child = self[i]
            if child.tail:
                if words:
                    words, text = last_words(child.tail, words)
                    snippet = [('text', text)] + snippet

            # Comments and other non-WLElement nodes have no snip();
            # only their tail (handled above) can contribute.
            if words and isinstance(child, WLElement):
                words, subsnip = child.snip(words, sub=True)
                snippet = subsnip + snippet

        if words and self.text:
            words, text = last_words(self.text, words)
            snippet = [('text', text)] + snippet

        snippet = [('start', self.tag, self.attrib)] + snippet + [('end',)]

        if not sub and words and not self.ASIDE:
            # do we dare go up?
            parent = self.getparent()
            if parent is not None and parent.CAN_HAVE_TEXT:
                words, parsnip = parent.snip(words, before=self)
                # Nest our events just before the parent's closing 'end'.
                return words, parsnip[:-1] + snippet + parsnip[-1:]

        return words, snippet

    def get_snippet(self, words=15):
        """Build a 'snippet' element with the text preceding this element.

        The events gathered by the parent's snip() are replayed into a
        fresh element tree rooted at a new 'snippet' element, which
        carries this element's metadata.  Requires a parent.
        """
        from librarian.parser import parser

        words, snippet = self.getparent().snip(words=words, before=self)

        cursor = snipelem = parser.makeelement('snippet')
        snipelem._meta_object = self.meta
        for s in snippet:
            if s[0] == 'start':
                elem = parser.makeelement(s[1], **s[2])
                cursor.append(elem)
                cursor = elem
            elif s[0] == 'end':
                cursor = cursor.getparent()
            else:
                # Text goes after the last child if there is one,
                # otherwise into the element's own text.
                if len(cursor):
                    cursor[-1].tail = (cursor[-1].tail or '') + s[1]
                else:
                    cursor.text = (cursor.text or '') + s[1]

        return snipelem

    @property
    def numbering(self):
        """Effective numbering scheme of this element, or None.

        None when NUMBERING is unset or an ancestor sets a truthy
        DISABLE_NUMBERING; otherwise NUMBERING as remapped by the
        nearest ancestor's SUPPRESS_NUMBERING mapping, if any.
        """
        numbering = self.NUMBERING
        if numbering is None or self.in_context_of('DISABLE_NUMBERING'):
            return None
        numbering = self.get_context_map('SUPPRESS_NUMBERING', numbering, numbering)
        return numbering

    @property
    def id_prefix(self):
        """Prefix for generated ids: the numbering name, or 'f' for 'main'."""
        prefix = self.numbering
        if prefix == 'main':
            # TODO: self.context.main_numbering_prefix
            prefix = 'f' # default numbering prefix
        return prefix

    def assign_id(self, document):
        """Assign '_id' (and possibly '_visible_numbering') attributes.

        Uses and advances ``document.counters[numbering]``.  For 'main'
        numbering the visible number comes from, and advances, the
        separate '_visible' counter; for 'fn' it equals the id number.
        Does nothing for unnumbered elements.
        """
        numbering = self.numbering
        if numbering:
            number = str(document.counters[numbering])
            self.attrib['_id'] = self.id_prefix + number
            document.counters[numbering] += 1

            if numbering == 'main':
                self.attrib['_visible_numbering'] = str(document.counters['_visible'])
                document.counters['_visible'] += 1

            if numbering == 'fn':
                self.attrib['_visible_numbering'] = number

    def get_link(self):
        """Return the '_id' of this element or of its nearest ancestor.

        NOTE(review): when no element up to the root has an '_id', the
        recursion calls get_link() on a None parent and raises
        AttributeError -- confirm callers never hit that case.
        """
        return self.attrib.get('_id') or self.getparent().get_link()
373
374
class Snippet(WLElement):
    """Plain WLElement subclass that adds no behavior of its own.

    NOTE(review): presumably the class behind the 'snippet' root element
    created in WLElement.get_snippet() -- confirm against the parser's
    tag-to-class mapping.
    """