Move HTML from the old transform sheet.
[librarian.git] / src / librarian / elements / base.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 import copy
5 import re
6 from lxml import etree
7 from librarian import dcparser, RDFNS
8 from librarian.util import get_translation
9
def last_words(text, n):
    """Return the tail of *text* containing its last *n* significant words.

    Words are whitespace-separated. Only words longer than two characters
    count towards *n*; shorter ones are carried along without being counted.

    Returns a ``(remaining, snippet)`` tuple:

    * when enough words were found, ``remaining`` is 0 and ``snippet`` is
      the collected words joined with single spaces;
    * otherwise ``remaining`` is the number of words still missing and
      ``snippet`` is *text*, unchanged;
    * when *n* is not positive, nothing is requested and ``(0, '')`` is
      returned.
    """
    if n <= 0:
        # Without this guard the counter would be decremented below zero,
        # never hit the stop condition, and the whole text would come back
        # together with a negative "remaining" count.
        return 0, ''

    words = []
    for w in reversed(text.split()):
        words.append(w)
        if len(w) > 2:
            n -= 1
            if not n:
                break

    if n:
        # Not enough significant words: hand back everything we were given.
        return n, text

    words.reverse()
    return n, ' '.join(words)
21
22
class WLElement(etree.ElementBase):
    """Base class for document elements with TXT, HTML and EPUB build hooks.

    Output is configured mostly through the class attributes below, which
    subclasses are expected to override. The ``*_build`` methods drive a
    ``builder`` object whose API (``push_text``, ``start_element``,
    ``end_element``, ...) is defined outside of this class.
    """

    # When truthy, the element is treated as a header: it is passed as the
    # TOC level in epub_build() and its placement is checked by validate().
    SECTION_PRECEDENCE = None
    # When true, snip() neither descends into this element nor climbs out
    # of it into its parent.
    ASIDE = False

    # Margins and literal prefix/suffix pushed around the content by
    # txt_build().
    TXT_TOP_MARGIN = 0
    TXT_BOTTOM_MARGIN = 0
    TXT_PREFIX = ""
    TXT_SUFFIX = ""

    # Tag, base attributes and class of the element emitted by html_build();
    # with no tag, only the content is emitted.
    HTML_TAG = None
    HTML_ATTR = {}
    HTML_CLASS = None

    # Same as above, for epub_build().
    EPUB_TAG = None
    EPUB_ATTR = {}
    EPUB_CLASS = None
    # When true and the parent is a Master, epub_build() starts a new chunk.
    EPUB_START_CHUNK = False

    # Whether text and child tails are pushed to the builder at all.
    CAN_HAVE_TEXT = True
    # Whether leading/trailing whitespace of the content is stripped.
    STRIP = False
    # Name of the counter used by assign_id(); None disables numbering.
    NUMBERING = None

    # Ordered (needle, replacement) pairs applied by normalize_text().
    # Order matters: '---' has to be replaced before '--'.
    text_substitutions = [
        ('---', '—'),
        ('--', '–'),
        #('...', '…'),  # Temporary turnoff for epub
        (',,', '„'),
        ('"', '”'),
        ('\ufeff', ''),

        ("'", "\u2019"),    # This was enabled for epub.
    ]

    # Splits text into runs of word characters and single non-word
    # characters, so that only words are handed to the hyphenator as units.
    _WORD_OR_CHAR_RE = re.compile(r'\w+|[^\w]', re.UNICODE)

    @property
    def meta_object(self):
        """Metadata parsed from this element's own RDF child, or None.

        The result is cached on the instance in ``_meta_object``.
        """
        if not hasattr(self, '_meta_object'):
            elem = self.find(RDFNS('RDF'))
            if elem is not None:
                self._meta_object = dcparser.BookInfo.from_element(elem)
            else:
                self._meta_object = None
        return self._meta_object

    @property
    def meta(self):
        """Nearest available metadata.

        Uses this element's own metadata if present, otherwise the parent's;
        at the root falls back to ``self.document.base_meta``.
        """
        if self.meta_object is not None:
            return self.meta_object
        parent = self.getparent()
        if parent is not None:
            return parent.meta
        return self.document.base_meta

    @property
    def gettext(self):
        """The ``gettext`` callable of the translation for ``meta.language``."""
        return get_translation(self.meta.language).gettext

    def in_context_of(self, setting):
        """Return the value of attribute *setting* on the nearest ancestor.

        Ancestors that do not define the attribute are skipped. Returns
        False when no ancestor defines it.
        """
        parent = self.getparent()
        if parent is None:
            return False
        try:
            return getattr(parent, setting)
        except AttributeError:
            return parent.in_context_of(setting)

    def get_context_map(self, setting, key, default=None):
        """Look up *key* in the mapping attribute *setting* of an ancestor.

        Walks up from the parent. An ancestor that does not define the
        attribute, or whose mapping has no entry for *key*, is skipped.
        Returns *default* when no ancestor provides a value.
        """
        parent = self.getparent()
        if parent is None:
            return default
        try:
            return getattr(parent, setting)[key]
        except (AttributeError, KeyError):
            # A mapping without this key says nothing about it; keep looking
            # further up instead of crashing with KeyError.
            return parent.get_context_map(setting, key, default)

    def signal(self, signal):
        """Pass *signal* up to the parent element, if there is one."""
        parent = self.getparent()
        if parent is not None:
            parent.signal(signal)

    def raw_printable_text(self, builder):
        """Return the normalized text content of this element as a string.

        Recurses into WLElement children, except for the tags
        'pe', 'pa', 'pt', 'pr' and 'motyw', whose content is left out.
        The tail text of every child is kept, including children that are
        not WLElement instances (e.g. comments).
        """
        # TODO: subtags, emphasis, etc.
        parts = [self.normalize_text(self.text, builder)]
        for c in self:
            # Check the type first: comments have a non-string tag.
            if (isinstance(c, WLElement)
                    and c.tag not in ('pe', 'pa', 'pt', 'pr', 'motyw')):
                parts.append(c.raw_printable_text(builder))
            # The tail belongs to this element's text no matter what kind
            # of node it follows.
            parts.append(self.normalize_text(c.tail, builder))
        return ''.join(parts)

    def normalize_text(self, text, builder):
        """Apply typographic normalization to *text* and return the result.

        None is treated as an empty string. Steps, in order:

        1. the replacements from ``text_substitutions``;
        2. if ``builder`` has a non-None ``hyphenator``, each word is passed
           through ``hyphenator.inserted(word, '\\u00AD')``;
        3. if ``builder.orphans`` is true, whitespace following a single
           word character that itself follows whitespace is replaced
           with a no-break space.
        """
        text = text or ''
        for e, s in self.text_substitutions:
            text = text.replace(e, s)
        # FIXME: Temporary turnoff
        # text = re.sub(r'\s+', ' ', text)
        # TODO: Added now for epub

        if getattr(builder, 'hyphenator', None) is not None:
            text = ''.join(
                builder.hyphenator.inserted(w, '\u00AD')
                for w in self._WORD_OR_CHAR_RE.findall(text)
            )

        if builder.orphans:
            text = re.sub(r'(?<=\s\w)\s+', '\u00A0', text)

        return text

    def _build_inner(self, builder, build_method):
        """Emit this element's content: its text, children and their tails.

        *build_method* is the name of the method called on each WLElement
        child (e.g. ``'txt_build'``). Text is only pushed when
        ``CAN_HAVE_TEXT`` is true. With ``STRIP``, the leading whitespace of
        the element text and the trailing whitespace of the last piece of
        text are removed.
        """
        child_count = len(self)
        if self.CAN_HAVE_TEXT and self.text:
            text = self.normalize_text(self.text, builder)
            if self.STRIP:
                text = text.lstrip()
                if not child_count:
                    text = text.rstrip()
            builder.push_text(text)
        for i, child in enumerate(self):
            if isinstance(child, WLElement):
                getattr(child, build_method)(builder)
            # FIXME base builder api
            elif getattr(builder, 'debug', False) and child.tag is etree.Comment:
                builder.process_comment(child)
            if self.CAN_HAVE_TEXT and child.tail:
                text = self.normalize_text(child.tail, builder)
                if self.STRIP and i == child_count - 1:
                    text = text.rstrip()
                builder.push_text(text)

    def _txt_build_inner(self, builder):
        """Emit the content using the children's ``txt_build``."""
        self._build_inner(builder, 'txt_build')

    def txt_build(self, builder):
        """Emit this element as plain text.

        Pushes the top margin, ``TXT_PREFIX``, the content, ``TXT_SUFFIX``
        and the bottom margin. If the class defines
        ``TXT_LEGACY_TOP_MARGIN`` / ``TXT_LEGACY_BOTTOM_MARGIN``, those are
        pushed as legacy margins instead of the regular ones.
        """
        if hasattr(self, 'TXT_LEGACY_TOP_MARGIN'):
            builder.push_legacy_margin(self.TXT_LEGACY_TOP_MARGIN)
        else:
            builder.push_margin(self.TXT_TOP_MARGIN)
        builder.push_text(self.TXT_PREFIX, True)
        self._txt_build_inner(builder)
        builder.push_text(self.TXT_SUFFIX, True)
        if hasattr(self, 'TXT_LEGACY_BOTTOM_MARGIN'):
            builder.push_legacy_margin(self.TXT_LEGACY_BOTTOM_MARGIN)
        else:
            builder.push_margin(self.TXT_BOTTOM_MARGIN)

    def _html_build_inner(self, builder):
        """Emit the content using the children's ``html_build``."""
        self._build_inner(builder, 'html_build')

    def get_html_attr(self, builder):
        """Return a new dict of attributes for the HTML output element.

        Starts from a copy of ``HTML_ATTR`` and adds ``HTML_CLASS`` as
        ``class``. When ``builder.with_ids`` is true, the ``id`` is taken
        from the source ``id`` attribute, overridden by ``_id`` if set.
        """
        attr = self.HTML_ATTR.copy()
        if self.HTML_CLASS:
            attr['class'] = self.HTML_CLASS
        if builder.with_ids:
            # always copy the id attribute (?)
            if self.attrib.get('id'):
                attr['id'] = self.attrib['id']
            if self.attrib.get('_id'):
                attr['id'] = self.attrib['_id']
        return attr

    def html_build(self, builder):
        """Emit this element as HTML.

        For elements numbered as 'main', a visible number is added first
        when the builder asks for numbering and ``has_visible_numbering``
        is true (that attribute is not defined in this class).
        The content is wrapped in ``HTML_TAG`` when one is set.
        """
        # Do we need a number?
        numbering = self.numbering
        if numbering == 'main':
            if builder.with_numbering and self.has_visible_numbering:
                builder.add_visible_number(self)

        if self.HTML_TAG:
            builder.start_element(
                self.HTML_TAG,
                self.get_html_attr(builder),
            )

        self._html_build_inner(builder)
        if self.HTML_TAG:
            builder.end_element()

    def _epub_build_inner(self, builder):
        """Emit the content using the children's ``epub_build``."""
        self._build_inner(builder, 'epub_build')

    def get_epub_attr(self, builder):
        """Return a new dict of attributes for the EPUB output element."""
        attr = self.EPUB_ATTR.copy()
        if self.EPUB_CLASS:
            attr['class'] = self.EPUB_CLASS
        return attr

    def epub_build(self, builder):
        """Emit this element as EPUB content.

        Starts a new chunk for ``EPUB_START_CHUNK`` elements placed directly
        in a Master. Header elements (truthy ``SECTION_PRECEDENCE``) outside
        of a 'NO_TOC' context get a TOC entry; unless they start a chunk,
        they also get a generated ``sub<N>`` id to link to. The content is
        wrapped in ``EPUB_TAG`` when one is set.
        """
        from librarian.elements.masters import Master

        # TEMPORARY
        self.CAN_HAVE_TEXT = True
        self.STRIP = False

        start_chunk = self.EPUB_START_CHUNK and isinstance(self.getparent(), Master)

        if start_chunk:
            builder.start_chunk()

        fragment = None
        if self.SECTION_PRECEDENCE and not self.in_context_of('NO_TOC'):
            if not start_chunk:
                fragment = 'sub%d' % builder.assign_section_number()
                self.attrib['id'] = fragment

            builder.add_toc_entry(
                fragment,
                self.raw_printable_text(builder),
                self.SECTION_PRECEDENCE
            )

        if self.EPUB_TAG:
            attr = self.get_epub_attr(builder)
            if fragment:
                attr['id'] = fragment
            if builder.debug:
                # Record "<chunk number>:<source line>"; once there are
                # splits, the line is counted from the last split.
                chunkno, sourceline = 0, self.sourceline
                if builder.splits:
                    chunkno, sourceline = len(builder.splits), sourceline - builder.splits[-1]
                attr['data-debug'] = f'{chunkno}:{sourceline}'
            builder.start_element(
                self.EPUB_TAG,
                attr
            )

        self._epub_build_inner(builder)
        if self.EPUB_TAG:
            builder.end_element()

    def validate(self):
        """Recursively check the structure of this subtree.

        Header elements (truthy ``SECTION_PRECEDENCE``) must be direct
        children of a Master, DlugiCytat, PoezjaCyt or Footnote.

        Raises:
            AssertionError: when a header is found anywhere else.
        """
        from librarian.elements.masters import Master
        from librarian.elements.blocks import DlugiCytat, PoezjaCyt
        from librarian.elements.footnotes import Footnote

        if self.SECTION_PRECEDENCE:
            parent = self.getparent()
            # getattr: a header at the root has no parent to take a tag
            # from, and must still fail with the assertion, not with an
            # AttributeError raised while building the message.
            assert isinstance(parent, (Master, DlugiCytat, PoezjaCyt, Footnote)), \
                    'Header {} inside a <{}> instead of a master.'.format(
                            etree.tostring(self, encoding='unicode'),
                            getattr(parent, 'tag', None))

        for c in self:
            if isinstance(c, WLElement):
                c.validate()

    def sanitize(self):
        """Recursively call ``sanitize`` on all WLElement children."""
        # TODO: Remove insanity here.
        for e in self:
            if isinstance(e, WLElement):
                e.sanitize()

    def snip(self, words, before=None, sub=False):
        """Collect up to *words* significant words of text, going backwards.

        Text is gathered from the end of this element, or from just before
        the child *before* if given, towards its beginning. If that is not
        enough and this is the outermost call (``sub`` false), collecting
        continues in the parent, as long as the parent can have text.
        ``ASIDE`` elements contribute nothing when reached as children and
        are never climbed out of.

        Returns ``(remaining, events)`` where ``remaining`` is the number of
        words still missing and ``events`` is a list of
        ``('start', tag, attrib)``, ``('text', text)`` and ``('end',)``
        tuples describing the collected fragment in document order.
        """
        if sub and self.ASIDE:
            return words, []

        snippet = []
        if before is not None:
            i = self.index(before)
        else:
            i = len(self)

        # Walk the children right to left until enough words are collected.
        while i > 0 and words:
            i -= 1
            child = self[i]
            if child.tail:
                words, text = last_words(child.tail, words)
                snippet = [('text', text)] + snippet

            # Comments and other non-WLElement nodes have no snip();
            # only their tail (handled above) counts as text.
            if words and isinstance(child, WLElement):
                words, subsnip = child.snip(words, sub=True)
                snippet = subsnip + snippet

        if words and self.text:
            words, text = last_words(self.text, words)
            snippet = [('text', text)] + snippet

        snippet = [('start', self.tag, self.attrib)] + snippet + [('end',)]

        if not sub and words and not self.ASIDE:
            # do we dare go up?
            parent = self.getparent()
            if parent is not None and parent.CAN_HAVE_TEXT:
                words, parsnip = parent.snip(words, before=self)
                # Nest our events just before the parent's closing 'end'.
                return words, parsnip[:-1] + snippet + parsnip[-1:]

        return words, snippet

    def get_snippet(self, words=15):
        """Build a 'snippet' element with the text preceding this element.

        Collects up to *words* significant words that come before this
        element (see ``snip``) and rebuilds them as a small tree under a
        new ``snippet`` element, which inherits this element's metadata.
        """
        from librarian.parser import parser

        words, snippet = self.getparent().snip(words=words, before=self)

        cursor = snipelem = parser.makeelement('snippet')
        snipelem._meta_object = self.meta
        for s in snippet:
            if s[0] == 'start':
                elem = parser.makeelement(s[1], **s[2])
                cursor.append(elem)
                cursor = elem
            elif s[0] == 'end':
                cursor = cursor.getparent()
            else:
                # Text goes after the last child if there is one,
                # otherwise into the element's own text.
                if len(cursor):
                    cursor[-1].tail = (cursor[-1].tail or '') + s[1]
                else:
                    cursor.text = (cursor.text or '') + s[1]

        return snipelem

    @property
    def numbering(self):
        """Name of the counter effective for this element, or None.

        None when ``NUMBERING`` is None or an ancestor sets a truthy
        'DISABLE_NUMBERING'. Otherwise ``NUMBERING``, possibly remapped by
        an ancestor's 'SUPPRESS_NUMBERING' mapping.
        """
        numbering = self.NUMBERING
        if numbering is None or self.in_context_of('DISABLE_NUMBERING'):
            return None
        numbering = self.get_context_map('SUPPRESS_NUMBERING', numbering, numbering)
        return numbering

    @property
    def id_prefix(self):
        """Prefix for generated ids: 'f' for 'main', else the counter name."""
        prefix = self.numbering
        if prefix == 'main':
            # TODO: self.context.main_numbering_prefix
            prefix = 'f' # default numbering prefix
        return prefix

    def assign_id(self, builder):
        """Assign a generated ``_id`` from the builder's counters.

        Does nothing for unnumbered elements. Otherwise sets ``_id`` to the
        id prefix followed by the current counter value and advances the
        counter. 'main' elements also get ``_visible_numbering`` from the
        separate '_visible' counter; 'fn' elements reuse their own number.
        """
        numbering = self.numbering
        if numbering:
            number = str(builder.counters[numbering])
            self.attrib['_id'] = self.id_prefix + number
            builder.counters[numbering] += 1

            if numbering == 'main':
                self.attrib['_visible_numbering'] = str(builder.counters['_visible'])
                builder.counters['_visible'] += 1

            if numbering == 'fn':
                self.attrib['_visible_numbering'] = number

    def get_link(self):
        """Return the ``_id`` of this element or of its nearest ancestor.

        Returns None when neither this element nor any ancestor has one.
        """
        link = self.attrib.get('_id')
        if link:
            return link
        parent = self.getparent()
        if parent is None:
            return None
        return parent.get_link()
368
369
class Snippet(WLElement):
    """WLElement subclass that adds no behaviour of its own."""