wip change fb2 api
[librarian.git] / src / librarian / elements / base.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 import copy
5 import re
6 from lxml import etree
7 from librarian import dcparser, RDFNS
8 from librarian.util import get_translation
9
def last_words(text, n):
    """Take up to ``n`` "long" words from the end of ``text``.

    Words are whitespace-separated; only words longer than two characters
    count against ``n``, shorter ones are carried along for free.

    Returns a ``(remaining, text)`` tuple. When ``text`` did not contain
    enough long words, ``remaining`` is the number still missing and the
    original ``text`` is returned unchanged. Otherwise ``remaining`` is 0
    and the collected trailing words are returned joined by single spaces.
    """
    collected = []
    for word in reversed(text.split()):
        collected.append(word)
        if len(word) <= 2:
            # Short words do not count towards the quota.
            continue
        n -= 1
        if n == 0:
            break

    if n:
        return n, text

    collected.reverse()
    return n, ' '.join(collected)
21
22
class WLElement(etree.ElementBase):
    """Base element class carrying the settings and hooks used by builders.

    The class-level names below are read by the ``*_build`` methods and the
    helpers of this class; subclasses presumably override them to change
    how a given tag is rendered (NOTE(review): subclasses are not visible
    in this file).
    """
    # When truthy, the element is treated as a section header: fb2_build
    # opens a section, epub_build may add a TOC entry with this value, and
    # validate() checks the type of the parent element.
    SECTION_PRECEDENCE = None
    # When true, snip() neither descends into this element nor climbs out
    # of it while collecting preceding words.
    ASIDE = False

    # Plain-text output: margins are passed to builder.push_margin() and
    # prefix/suffix to builder.push_text() by txt_build().
    TXT_TOP_MARGIN = 0
    TXT_BOTTOM_MARGIN = 0
    TXT_PREFIX = ""
    TXT_SUFFIX = ""

    # HTML output: wrapping tag (none when falsy), base attributes (copied
    # per call by get_html_attr) and optional value for "class".
    HTML_TAG = None
    HTML_ATTR = {}
    HTML_CLASS = None

    # EPUB output: same trio as for HTML, plus a flag that makes
    # epub_build() start a new chunk when the parent is a Master element.
    EPUB_TAG = None
    EPUB_ATTR = {}
    EPUB_CLASS = None
    EPUB_START_CHUNK = False

    # FB2 output: wrapping tag used by fb2_build() (none when falsy).
    FB2_TAG = None

    # build_inner() pushes the element's text and its children's tails only
    # when CAN_HAVE_TEXT is true; with STRIP it also trims the leading and
    # trailing whitespace of the element's content.
    CAN_HAVE_TEXT = True
    STRIP = False
    # Name of the document counter used by assign_id(); None means the
    # element is not numbered.
    NUMBERING = None

    # (pattern, replacement) pairs applied in this order by
    # normalize_text(); '---' comes before '--' so the longer pattern is
    # replaced first.
    text_substitutions = [
        ('---', '—'),
        ('--', '–'),
        ('...', '…'),
        (',,', '„'),
        ('"', '”'),
        ('\ufeff', ''),
        ("'", "\u2019"),
    ]
56
57     @property
58     def meta_object(self):
59         if not hasattr(self, '_meta_object'):
60             elem = self.find(RDFNS('RDF'))
61             if elem is not None:
62                 self._meta_object = dcparser.BookInfo.from_element(elem)
63             else:
64                 self._meta_object = None
65         return self._meta_object
66
67     @property
68     def meta(self):
69         if self.meta_object is not None:
70             return self.meta_object
71         else:
72             if self.getparent() is not None:
73                 return self.getparent().meta
74             else:
75                 return self.document.base_meta
76
    @property
    def gettext(self):
        """Return the ``gettext`` callable of the translation selected by
        ``self.meta.language``."""
        return get_translation(self.meta.language).gettext
80
81     def in_context_of(self, setting):
82         parent = self.getparent()
83         if parent is None:
84             return False
85         try:
86             return getattr(parent, setting)
87         except AttributeError:
88             return parent.in_context_of(setting)
89
90     def get_context_map(self, setting, key, default=None):
91         parent = self.getparent()
92         if parent is None:
93             return default
94         try:
95             return getattr(parent, setting)[key]
96         except AttributeError:
97             return parent.get_context_map(setting, key, default)
98
99     def signal(self, signal):
100         parent = self.getparent()
101         if parent is not None:
102             parent.signal(signal)
103     
104     def raw_printable_text(self, builder):
105         from librarian.html import raw_printable_text
106
107         # TODO: podtagi, wyroznienia, etc
108         t = ''
109         t += self.normalize_text(self.text, builder)
110         for c in self:
111             if not isinstance(c, WLElement):
112                 continue
113             if c.tag not in ('pe', 'pa', 'pt', 'pr', 'motyw'):
114                 t += c.raw_printable_text(builder)
115             t += self.normalize_text(c.tail, builder)
116         return t
117     
118     def normalize_text(self, text, builder):
119         text = text or ''
120         for e, s in self.text_substitutions:
121             text = text.replace(e, s)
122
123         if getattr(builder, 'normalize_whitespace', False):
124             text = re.sub(r'\s+', ' ', text)
125
126         if getattr(builder, 'hyphenator', None) is not None:
127             newt = ''
128             wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(text)
129             for w in wlist:
130                 newt += builder.hyphenator.inserted(w, '\u00AD')
131             text = newt
132
133         if builder.orphans:
134             text = re.sub(r'(?<=\s\w)\s+', '\u00A0', text)
135
136         return text
137
138     def build_inner(self, builder):
139         build_method = builder.build_method_fn
140         child_count = len(self)
141         if self.CAN_HAVE_TEXT and self.text:
142             text = self.normalize_text(self.text, builder)
143             if self.STRIP:
144                 text = text.lstrip()
145                 if not child_count:
146                     text = text.rstrip()
147             builder.push_text(text)
148         for i, child in enumerate(self):
149             real_child_count = 0
150             if isinstance(child, WLElement):
151                 getattr(child, build_method)(builder)
152                 self.after_child(builder, real_child_count)
153                 real_child_count += 1
154
155             # FIXME base builder api
156             elif getattr(builder, 'debug', False) and child.tag is etree.Comment:
157                 builder.process_comment(child)
158             if self.CAN_HAVE_TEXT and child.tail:
159                 text = self.normalize_text(child.tail, builder)
160                 if self.STRIP and i == child_count - 1:
161                     text = text.rstrip()
162                 builder.push_text(text)
163
164     def after_child(self, builder, child_count):
165         fn = getattr(builder, 'after_child_fn', None)
166         if fn:
167             getattr(self, builder.after_child_fn)(builder, child_count)
168
169     def txt_after_child(self, builder, child_count):
170         pass
171
    def txt_build_inner(self, builder):
        """Build the element's content for text output.

        The base implementation delegates to ``build_inner``.
        """
        self.build_inner(builder)
174
    def txt_build(self, builder):
        """Build the element for text output.

        Pushes the top margin and the prefix, builds the content with
        ``txt_build_inner``, then pushes the suffix and the bottom margin.
        """
        builder.push_margin(self.TXT_TOP_MARGIN)
        # The second argument is passed as True for prefix/suffix only;
        # NOTE(review): its meaning is defined by the builder, not here.
        builder.push_text(self.TXT_PREFIX, True)
        self.txt_build_inner(builder)
        builder.push_text(self.TXT_SUFFIX, True)
        builder.push_margin(self.TXT_BOTTOM_MARGIN)
181
    def html_build_inner(self, builder):
        """Build the element's content for HTML output.

        The base implementation delegates to ``build_inner``.
        """
        self.build_inner(builder)
184
185     def get_html_attr(self, builder):
186         attr = self.HTML_ATTR.copy()
187         if self.HTML_CLASS:
188             attr['class'] = self.HTML_CLASS
189         if builder.with_ids:
190             # always copy the id attribute (?)
191             if self.attrib.get('id'):
192                 attr['id'] = self.attrib['id']
193             if self.attrib.get('_id'):
194                 attr['id'] = self.attrib['_id']
195         return attr
196
197     def html_build(self, builder):
198         # Do we need a number?
199         numbering = self.numbering
200         if numbering == 'main':
201             if builder.with_numbering and self.has_visible_numbering:
202                 builder.add_visible_number(self)
203
204         if self.HTML_TAG:
205             builder.start_element(
206                 self.HTML_TAG,
207                 self.get_html_attr(builder),
208             )
209
210         self.html_build_inner(builder)
211         if self.HTML_TAG:
212             builder.end_element()
213
214     def fb2_build(self, builder):
215         if self.SECTION_PRECEDENCE:
216             builder.start_section(self.SECTION_PRECEDENCE)
217             builder.start_element('title')
218             builder.start_element('p')
219
220         if self.FB2_TAG:
221             builder.start_element(
222                 self.FB2_TAG,
223                 #self.get_fb2_attr(builder),
224             )
225
226         self.build_inner(builder)
227         if self.FB2_TAG:
228             builder.end_element()
229         if self.SECTION_PRECEDENCE:
230             builder.end_element()
231             builder.end_element()
232
    def epub_build_inner(self, builder):
        """Build the element's content for EPUB output.

        The base implementation delegates to ``build_inner``.
        """
        self.build_inner(builder)
235
236     def get_epub_attr(self, builder):
237         attr = self.EPUB_ATTR.copy()
238         if self.EPUB_CLASS:
239             attr['class'] = self.EPUB_CLASS
240         return attr
241
242     def epub_build(self, builder):
243         from librarian.elements.masters import Master
244
245         # TEMPORARY
246         self.CAN_HAVE_TEXT = True
247         self.STRIP = False
248
249         start_chunk = self.EPUB_START_CHUNK and isinstance(self.getparent(), Master)
250
251         if start_chunk:
252             builder.start_chunk()
253
254         fragment = None
255         if self.SECTION_PRECEDENCE and not self.in_context_of('NO_TOC'):
256             if not start_chunk:
257                 fragment = 'sub%d' % builder.assign_section_number()
258                 self.attrib['id'] = fragment
259
260             builder.add_toc_entry(
261                 fragment,
262                 self.raw_printable_text(builder),
263                 self.SECTION_PRECEDENCE
264             )
265             
266         if self.EPUB_TAG:
267             attr = self.get_epub_attr(builder)
268             if fragment:
269                 attr['id'] = fragment
270             if builder.debug:
271                 chunkno, sourceline = 0, self.sourceline
272                 if builder.splits:
273                     chunkno, sourceline = len(builder.splits), sourceline - builder.splits[-1]
274                 attr['data-debug'] = f'{chunkno}:{sourceline}'
275             builder.start_element(
276                 self.EPUB_TAG,
277                 attr
278             )
279
280         self.epub_build_inner(builder)
281         if self.EPUB_TAG:
282             builder.end_element()
283
284     def validate(self):
285         from librarian.elements.masters import Master
286         from librarian.elements.blocks import DlugiCytat, PoezjaCyt
287         from librarian.elements.footnotes import Footnote
288
289         if self.SECTION_PRECEDENCE:
290             assert isinstance(self.getparent(), (Master, DlugiCytat, PoezjaCyt, Footnote)), \
291                     'Header {} inside a <{}> instead of a master.'.format(
292                             etree.tostring(self, encoding='unicode'), self.getparent().tag)
293
294         for c in self:
295             if isinstance(c, WLElement):
296                 c.validate()
297
298
299     def sanitize(self):
300         # TODO: Remove insanity here.
301         for e in self:
302             if isinstance(e, WLElement):
303                 e.sanitize()
304
305     def snip(self, words, before=None, sub=False):
306         if sub and self.ASIDE:
307             return words, []
308
309         snippet = []
310         if before is not None:
311             i = self.index(before)
312         else:
313             i = len(self)
314
315         while i > 0:
316             i -= 1
317             if self[i].tail:
318                 if words:
319                     words, text = last_words(self[i].tail, words)
320                     snippet = [('text', text)] + snippet
321
322             if words:
323                 words, subsnip = self[i].snip(words, sub=True)
324                 snippet = subsnip + snippet
325
326         if words and self.text:
327             words, text = last_words(self.text, words)
328             snippet = [('text', text)] + snippet
329                     
330         snippet = [('start', self.tag, self.attrib)] + snippet + [('end',)]
331
332         if not sub and words and not self.ASIDE:
333             # do we dare go up?
334             parent = self.getparent()
335             if parent is not None and parent.CAN_HAVE_TEXT:
336                 words, parsnip = parent.snip(words, before=self)
337                 return words, parsnip[:-1] + snippet + parsnip[-1:]
338
339         return words, snippet
340
341     def get_snippet(self, words=15):
342         from librarian.parser import parser
343
344         words, snippet = self.getparent().snip(words=words, before=self)
345         
346         cursor = snipelem = parser.makeelement('snippet')
347         snipelem._meta_object = self.meta
348         for s in snippet:
349             if s[0] == 'start':
350                 elem = parser.makeelement(s[1], **s[2])
351                 cursor.append(elem)
352                 cursor = elem
353             elif s[0] == 'end':
354                 cursor = cursor.getparent()
355             else:
356                 if len(cursor):
357                     cursor[-1].tail = (cursor[-1].tail or '') + s[1]
358                 else:
359                     cursor.text = (cursor.text or '') + s[1]
360
361         return snipelem
362
363     @property
364     def numbering(self):
365         numbering = self.NUMBERING
366         if numbering is None or self.in_context_of('DISABLE_NUMBERING'):
367             return None
368         numbering = self.get_context_map('SUPPRESS_NUMBERING', numbering, numbering)
369         return numbering
370
371     @property
372     def id_prefix(self):
373         prefix = self.numbering
374         if prefix == 'main':
375             # TODO: self.context.main_numbering_prefix
376             prefix = 'f' # default numbering prefix
377         return prefix
378
379     def assign_id(self, document):
380         numbering = self.numbering
381         if numbering:
382             number = str(document.counters[numbering])
383             self.attrib['_id'] = self.id_prefix + number
384             document.counters[numbering] += 1
385
386             if numbering == 'main':
387                 self.attrib['_visible_numbering'] = str(document.counters['_visible'])
388                 document.counters['_visible'] += 1
389
390             if numbering == 'fn':
391                 self.attrib['_visible_numbering'] = number
392
393     def get_link(self):
394         return self.attrib.get('_id') or self.getparent().get_link()
395
396
class Snippet(WLElement):
    """Element class with no behavior of its own beyond ``WLElement``.

    NOTE(review): presumably bound to the ``snippet`` tag created by
    ``WLElement.get_snippet`` — confirm in the parser's tag mapping.
    """
    pass