Add main thema field. Move to calver.
[librarian.git] / src / librarian / html.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 import io
5 import os
6 import re
7 import copy
8 import urllib.parse
9 import urllib.request
10
11 from lxml import etree
12 from librarian import XHTMLNS, ParseError, OutputFile
13 from librarian import functions
14 from PIL import Image
15
16 from lxml.etree import XMLSyntaxError, XSLTApplyError
17
18
# NOTE(review): presumably these register the custom XPath/XSLT extension
# functions that the stylesheets rely on -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Known stylesheet names mapped to paths relative to this module's
# directory; get_stylesheet() resolves them to full paths.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
}
25
26
def get_stylesheet(name):
    """Return the path of the stylesheet registered under *name*.

    The path is built relative to the directory that contains this module.
    Raises KeyError when *name* is not a key of STYLESHEETS.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
29
30
def html_has_content(text):
    """Look for paragraphs or top-level headings in a parsed document.

    Evaluates an XPath query matching ``p`` and ``h1`` elements, both
    without a namespace and in the XHTML namespace, against *text* and
    returns the evaluation result. Callers use the result as a truth value.
    """
    query = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
    find_content = etree.ETXPath(query)
    return find_content(text)
35
36
def transform_abstrakt(abstrakt_element):
    """Render an abstract element to an HTML string with the legacy XSLT.

    The element is serialized, re-parsed as a standalone document and run
    through the 'legacy' stylesheet. Section anchors and the blockquote
    tags are then stripped from the serialized result.
    """
    stylesheet = etree.parse(get_stylesheet('legacy'))
    source = etree.tostring(abstrakt_element, encoding='unicode')
    # HACK: every occurrence of 'abstrakt' in the serialized markup (tag
    # names and any other text alike) is replaced, so that the stylesheet
    # processes the element as a 'dlugi_cytat'.
    renamed = source.replace('abstrakt', 'dlugi_cytat')
    document = etree.parse(io.StringIO(renamed))
    rendered = etree.tostring(document.xslt(stylesheet), encoding='unicode')
    without_anchors = re.sub('<a name="sec[0-9]*"/>', '', rendered)
    return re.sub('</?blockquote[^>]*>', '', without_anchors)
48
49
def add_image_sizes(tree, gallery_path, gallery_url, base_url):
    """Generate scaled copies of every ``ilustr`` image and set its srcset.

    For each ``ilustr`` element found in *tree*, the image referenced by
    its ``src`` attribute (resolved against *base_url*) is downloaded and
    saved in several widths. The element's ``srcset`` attribute is set to
    the list of generated variants and ``src`` to the widest one.

    Widths used are the predefined ones plus the original width, capped at
    the smaller of the original width and the largest predefined width, so
    images are never scaled up.

    :param tree: parsed document containing ``ilustr`` elements
    :param gallery_path: prefix prepended to file names when saving; it is
        concatenated as-is, so it should end with a path separator
    :param gallery_url: prefix prepended to file names in generated URLs;
        concatenated as-is as well
    :param base_url: base URL used to resolve relative ``src`` values
    :raises KeyError: if an ``ilustr`` element has no ``src`` attribute
    """
    widths = [360, 600, 1200, 1800, 2400]

    for i, ilustr in enumerate(tree.findall('//ilustr')):
        rel_path = ilustr.attrib['src']
        img_url = urllib.parse.urljoin(base_url, rel_path)

        # The response is a context manager, so the connection is released
        # even when decoding or saving the image fails. All work on the
        # image stays inside the block because PIL reads the data lazily.
        with urllib.request.urlopen(img_url) as f:
            img = Image.open(f)
            ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
            orig_width, orig_height = img.size

            # Needed widths: predefined and original, limited by
            # whichever is smaller.
            max_width = min(widths[-1], orig_width)
            img_widths = [
                w for w in sorted(set(widths + [orig_width]))
                if w <= max_width
            ]

            srcset = []
            largest_url = None
            for w in img_widths:
                fname = '%d.W%d.%s' % (i, w, ext)
                fpath = gallery_path + fname
                # Existing files are reused rather than regenerated.
                if not os.path.exists(fpath):
                    height = round(orig_height * w / orig_width)
                    th = img.resize((w, height))
                    th.save(fpath)
                th_url = gallery_url + fname
                srcset.append(" ".join((
                    th_url,
                    '%dw' % w
                )))
                # Widths are ascending, so the last one is the largest.
                largest_url = th_url

        ilustr.attrib['srcset'] = ", ".join(srcset)
        ilustr.attrib['src'] = largest_url
89
90
def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None,
              gallery_path='img/', gallery_url='img/', base_url='file://./'):
    """Transform the WL document to HTML.

    Works on a deep copy of *wldoc*; the caller's document is not modified.

    :param wldoc: the document to transform
    :param stylesheet: key of STYLESHEETS selecting the XSLT to apply
    :param options: optional dict passed as keyword arguments to the
        document's ``transform`` call
    :param flags: optional iterable of names; each is set as an attribute
        with value ``'yes'`` on the document root before transforming
    :param css: stylesheet URL handed to the XSLT; defaults to the
        Wolne Lektury book text CSS
    :param gallery_path: directory for generated image variants (created
        if missing)
    :param gallery_url: URL prefix for generated image variants
    :param base_url: base URL for resolving illustration sources
    :returns: an OutputFile with the serialized HTML, or None when the
        result contains no paragraphs or top-level headings
    :raises ValueError: if *stylesheet* is not a known stylesheet name
    :raises ParseError: on XML syntax or XSLT application errors
    """
    # Validate the stylesheet name on its own, so that a KeyError raised
    # elsewhere during the transformation is not misreported as an
    # invalid stylesheet.
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)

    try:
        # Parse XSLT
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')
        document.fix_pa_akap()

        if not options:
            options = {}

        # Best effort: the directory usually exists already; any real
        # problem will surface when the images are written.
        try:
            os.makedirs(gallery_path)
        except OSError:
            pass

        add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)

        css = (
            css
            or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
        )
        # The value is wrapped in single quotes before being handed over
        # (presumably so XSLT treats it as a string literal -- confirm).
        css = "'%s'" % css
        result = document.transform(style, css=css, **options)
        del document  # no longer needed large object :)

        if html_has_content(result):
            add_anchors(result.getroot())
            add_table_of_themes(result.getroot())
            add_table_of_contents(result.getroot())

            return OutputFile.from_bytes(etree.tostring(
                result, method='html', xml_declaration=False,
                pretty_print=True, encoding='utf-8'
            ))
        else:
            return None
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
148
149
class Fragment:
    """A themed fragment recorded as a flat list of parse events.

    Each event is an ``(event, payload)`` pair. For ``'start'`` and
    ``'end'`` the payload is an element; for any other event name the
    payload is a plain string of text.
    """

    def __init__(self, id, themes):
        super().__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair at the end of the fragment."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by 'end' events for unclosed tags.

        An 'end' event without a matching open tag is reported on stdout
        and otherwise ignored.
        """
        pending = []
        for kind, node in self.events:
            if kind == 'start':
                pending.append(('end', node))
            elif kind == 'end':
                if pending:
                    pending.pop()
                else:
                    print('CLOSED NON-OPEN TAG:', node)

        # Innermost open tag must be closed first.
        return self.events + pending[::-1]

    def to_string(self):
        """Serialize the balanced event stream to a markup string.

        Attribute values and text are emitted verbatim, without escaping.
        """
        pieces = []
        for kind, node in self.closed_events():
            if kind == 'start':
                attributes = ' '.join(
                    '%s="%s"' % (key, value)
                    for key, value in node.attrib.items()
                )
                pieces.append('<%s %s>' % (node.tag, attributes))
                if node.text:
                    pieces.append(node.text)
            elif kind == 'end':
                pieces.append('</%s>' % node.tag)
                if node.tail:
                    pieces.append(node.tail)
            else:
                # Any other event carries raw text as its payload.
                pieces.append(node)

        return ''.join(pieces)

    def __str__(self):
        return self.to_string()
198
199
def extract_fragments(input_filename):
    """Extract theme fragments from the HTML file *input_filename*.

    Elements with class 'theme-begin' open a fragment and elements with
    class 'theme-end' close it; the two are matched by their 'fid'
    attribute. While a fragment is open, the parse events of the
    following elements are recorded into it.

    Returns a ``(closed_fragments, open_fragments)`` pair of dicts mapping
    fragment id to Fragment. ``open_fragments`` holds the fragments that
    were never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document, so the file is parsed with
    # the HTML parser first and the subtree of interest is re-serialized
    # into a buffer that iterparse can read.
    parser = etree.HTMLParser(encoding='utf-8')
    buf = io.BytesIO()
    buf.write(etree.tostring(
        # First child of the root's first child.
        # NOTE(review): presumably the 'book-text' container -- confirm.
        etree.parse(input_filename, parser).getroot()[0][0],
        encoding='utf-8'
    ))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append copies of all ancestors up to (but excluding) the
                # element with id 'book-text', with text and id removed.
                # NOTE(review): this walk expects a 'book-text' ancestor;
                # without one, parent would eventually be None.
                parent = element.getparent()
                parents = []
                while parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    if 'id' in cparent.attrib:
                        del cparent.attrib['id']
                    parents.append(cparent)
                    parent = parent.getparent()

                # Collected innermost-first; emit outermost-first.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                # A second 'theme-begin' with an already open id is ignored.
                if fragment.id not in open_fragments:
                    open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    # Closing marker without a matching open fragment:
                    # report on stdout and carry on.
                    print('%s:closed not open fragment #%s' % (
                        input_filename, element.get('fid')
                    ))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the marker's tail text to every fragment that is
            # still open (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags: elements with a non-empty 'name'
            # attribute or with class 'annotation'/'anchor' are skipped,
            # only their tail text is kept.
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append(
                            'text', element.tail
                        )
            else:
                # Record the event in every open fragment, using a copy of
                # the element with its 'id' attribute removed.
                for fragment_id in open_fragments:
                    celem = copy.copy(element)
                    if 'id' in celem.attrib:
                        del celem.attrib['id']
                    open_fragments[fragment_id].append(
                        event, celem
                    )

    return closed_fragments, open_fragments
280
281
def add_anchor(element, prefix, with_link=True, with_target=True,
               link_text=None):
    """Insert anchor elements into the parent of *element*, before it.

    With *with_link*, an ``a.anchor`` element linking to ``#prefix`` is
    inserted; its text is *link_text*, or *prefix* when none is given.
    With *with_target*, an ``a.target`` element named *prefix* is inserted.
    Both are inserted at the original position of *element*, so when both
    are requested the target ends up before the link.
    """
    container = element.getparent()
    position = container.index(element)

    if with_link:
        label = prefix if link_text is None else link_text
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = str(label)
        container.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = ' '
        container.insert(position, target)
300
301
def any_ancestor(element, test):
    """Return True if *test* is truthy for any ancestor of *element*.

    Ancestors are taken from ``element.iterancestors()``; checking stops
    at the first match. Returns False when there is no match.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
307
308
def add_anchors(root):
    """Number verses and paragraphs below *root* and insert their anchors.

    Every ``div`` whose class contains 'verse' and every element whose
    class contains 'paragraph' gets a sequential internal number, used in
    the anchor name ``f<N>``. Paragraphs always get an anchor; verses get
    one only for the first visible number and every fifth one.

    The visible number shown in the link restarts whenever an element with
    class 'numeracja' is met: it is set to that element's ``data-start``
    value, or to 1 when the attribute is missing or not an integer.

    Elements located inside notes, mottos, dedications, frames, block
    quotes, the editorial note or the footnotes are not numbered.
    """
    def in_unnumbered_area(e):
        # Defined once, outside the loop: it does not depend on the
        # element currently being visited.
        return (
            e.get('class') in (
                'note', 'motto', 'motto_podpis', 'dedication', 'frame'
            )
            or e.get('id') == 'nota_red'
            or e.tag == 'blockquote'
            or e.get('id') == 'footnotes'
        )

    counter = 1
    visible_counter = 1
    for element in root.iterdescendants():
        if element.get('class') == 'numeracja':
            try:
                visible_counter = int(element.get('data-start'))
            except (TypeError, ValueError):
                # TypeError: attribute missing (None);
                # ValueError: attribute present but not an integer.
                visible_counter = 1

        if any_ancestor(element, in_unnumbered_area):
            continue

        # Comment and processing-instruction nodes return None from get()
        # regardless of the default, so normalize to an empty string.
        classes = element.get('class') or ''

        if element.tag == 'div' and 'verse' in classes:
            if visible_counter == 1 or visible_counter % 5 == 0:
                add_anchor(element, "f%d" % counter, link_text=visible_counter)
            counter += 1
            visible_counter += 1
        elif 'paragraph' in classes:
            add_anchor(element, "f%d" % counter, link_text=visible_counter)
            counter += 1
            visible_counter += 1
341
342
def raw_printable_text(element):
    """Return the stripped text content of *element*.

    Works on a deep copy. The text of direct ``a`` children with class
    'annotation' or 'theme-begin' is blanked before serializing; the
    original element is left untouched.
    """
    hidden_classes = ('annotation', 'theme-begin')
    clone = copy.deepcopy(element)
    for link in clone.findall('a'):
        if link.get('class') in hidden_classes:
            link.text = ''
    text = etree.tostring(clone, method='text', encoding='unicode')
    return text.strip()
349
350
def add_table_of_contents(root):
    """Build a table of contents and insert it as the first child of *root*.

    All ``h2`` and ``h3`` headings are collected, except those inside the
    footnotes, the editorial note or a person list. Each collected heading
    gets a target anchor named ``s<N>``. An ``h3`` directly following an
    ``h2`` entry in the top-level list is nested under it; everything else
    becomes a top-level entry.

    The table is a ``div#toc`` with a header and an ordered list.
    """
    def in_excluded_area(ancestor):
        return (
            ancestor.get('id') in ('footnotes', 'nota_red')
            or ancestor.get('class') in ('person-list',)
        )

    # Entries are (number, tag, text, subsections) tuples.
    sections = []
    counter = 1
    for element in root.iterdescendants():
        if element.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(element, in_excluded_area):
            continue

        entry = (counter, element.tag, raw_printable_text(element), [])
        if element.tag == 'h3' and sections and sections[-1][1] == 'h2':
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(element, "s%d" % counter, with_link=False)
        counter += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    etree.SubElement(toc, 'h2').text = 'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for number, _tag, title, children in sections:
        item = etree.SubElement(toc_list, 'li')
        # add_anchor inserts into the parent of the given element, so the
        # link lands in the list right before this item.
        add_anchor(item, "s%d" % number, with_target=False,
                   link_text=title)

        if children:
            sublist = etree.SubElement(item, 'ol')
            for child_number, _child_tag, child_title, _ in children:
                subitem = etree.SubElement(sublist, 'li')
                add_anchor(subitem, "s%d" % child_number, with_target=False,
                           link_text=child_title)

    root.insert(0, toc)
393
394
def add_table_of_themes(root):
    """Build an index of themes and insert it as the first child of *root*.

    Every ``a.theme-begin`` element with text contributes its ``name``
    attribute to each comma-separated theme listed in that text. The index
    is a ``div#themes`` holding an ordered list with one item per theme,
    sorted by theme name; each item contains numbered links to the
    fragments of that theme.

    Sorting uses ``sortify`` when that module can be imported and plain
    string order otherwise.
    """
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    anchors_by_theme = {}
    for marker in root.findall('.//a[@class="theme-begin"]'):
        if not marker.text:
            continue
        for raw_name in marker.text.split(','):
            anchors_by_theme.setdefault(raw_name.strip(), []).append(
                marker.get('name')
            )

    ordered_themes = sorted(
        anchors_by_theme.items(), key=lambda pair: sortify(pair[0])
    )

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, anchor_names in ordered_themes:
        entry = etree.SubElement(themes_ol, 'li')
        entry.text = "%s: " % theme_name
        for ordinal, anchor_name in enumerate(anchor_names, start=1):
            link = etree.SubElement(entry, 'a', href="#%s" % anchor_name)
            link.text = str(ordinal)
            link.tail = ' '
    root.insert(0, themes_div)
421
422
def extract_annotations(html_path):
    """Extract annotations from HTML for the annotations dictionary.

    Generator. For each ``div`` directly inside the element with id
    'footnotes', yields a tuple of:
    anchor, footnote type, valid qualifiers, text, html.

    The footnote type is taken from the second dash-separated part of the
    div's class and the anchor from the annotation link's href (without
    its first character). Qualifiers are read from a parenthesized list
    preceding the first em dash of the text; only entries found in
    FN_QUALIFIERS are kept. Yields nothing when there is no footnotes
    element.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        fn_type = footnote.get('class').split('-')[1]
        anchor = footnote.find('a[@class="annotation"]').get('href')[1:]

        # Drop the first two children and the leading text, then trim a
        # trailing bare newline, before serializing what is left.
        del footnote[:2]
        footnote.text = None
        if len(footnote) and footnote[-1].tail == '\n':
            footnote[-1].tail = None
        text_str = etree.tostring(footnote, method='text',
                                  encoding='unicode').strip()
        html_str = etree.tostring(footnote, method='html',
                                  encoding='unicode').strip()

        qualifiers = []
        match = re_qualifier.match(text_str)
        if match:
            for candidate in re.split('[;,]', match.group(1)):
                candidate = candidate.strip()
                if candidate in FN_QUALIFIERS:
                    qualifiers.append(candidate)
                elif candidate.startswith('z '):
                    # "z <qualifier>": check the word following "z".
                    subcandidate = candidate.split()[1]
                    if subcandidate in FN_QUALIFIERS:
                        qualifiers.append(subcandidate)

        yield anchor, fn_type, qualifiers, text_str, html_str