letters
[librarian.git] / src / librarian / html.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 import io
5 import os
6 import re
7 import copy
8 import urllib.parse
9 import urllib.request
10
11 from lxml import etree
12 from librarian import XHTMLNS, ParseError, OutputFile
13 from librarian import functions
14 from PIL import Image
15
16 from lxml.etree import XMLSyntaxError, XSLTApplyError
17
18
# Import-time side effects: both calls run once, when this module is loaded.
# NOTE(review): presumably they register the extension functions that the
# XSLT stylesheets call -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Stylesheet name -> stylesheet path, relative to this module's directory.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
}
25
26
def get_stylesheet(name):
    """Return the filesystem path of the stylesheet registered as ``name``.

    The path is built from this module's directory and the entry stored
    in ``STYLESHEETS``. Raises ``KeyError`` for an unregistered name.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
29
30
def html_has_content(text):
    """Return the ``p`` and ``h1`` elements found anywhere in ``text``.

    Both un-namespaced and XHTML-namespaced elements are matched. The
    result of the XPath evaluation is returned as is, so callers can use
    its truthiness to tell whether the document has any content.
    """
    namespace = str(XHTMLNS)
    query = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': namespace}
    find_content = etree.ETXPath(query)
    return find_content(text)
35
36
def transform_abstrakt(abstrakt_element):
    """Render an ``abstrakt`` element to an HTML string.

    The element is serialized, its ``abstrakt`` tags are renamed to
    ``dlugi_cytat``, and the result is run through the 'legacy'
    stylesheet. Section anchors (``<a name="secN"/>``) and any
    ``blockquote`` tags are then removed from the output.
    """
    style = etree.parse(get_stylesheet('legacy'))
    source = etree.tostring(abstrakt_element, encoding='unicode')

    # HACK: disguise the abstract as a long quotation for the transform.
    # NOTE(review): presumably the stylesheet renders dlugi_cytat as a
    # blockquote, which is why that wrapper is stripped again below.
    source = source.replace('<abstrakt', '<dlugi_cytat')
    source = source.replace('</abstrakt', '</dlugi_cytat')
    document = etree.parse(io.StringIO(source))

    rendered = etree.tostring(document.xslt(style), encoding='unicode')
    without_anchors = re.sub('<a name="sec[0-9]*"/>', '', rendered)
    return re.sub('</?blockquote[^>]*>', '', without_anchors)
48
49
def add_image_sizes(tree, gallery_path, gallery_url, base_url):
    """Create scaled copies of every illustration and reference them.

    For each ``ilustr`` element in ``tree`` the image at ``base_url``
    joined with the element's ``src`` is fetched, scaled copies are saved
    under ``gallery_path``, the element's ``srcset`` is set to the list of
    copies and its ``src`` to the widest one.

    Copies are produced for the predefined widths that do not exceed the
    original width, plus the original width itself when it is not larger
    than the biggest predefined width, so images are never upscaled.
    A copy whose file already exists is not regenerated.

    ``gallery_path`` and ``gallery_url`` are used as plain string
    prefixes for the generated file names, so both are expected to end
    with a separator.
    """
    widths = [360, 600, 1200, 1800, 2400]
    try:
        os.makedirs(gallery_path)
    except OSError:
        # Usually the directory already exists; any other problem
        # resurfaces as soon as a thumbnail has to be saved.
        pass

    for i, ilustr in enumerate(tree.findall('//ilustr')):
        rel_path = ilustr.attrib['src']
        img_url = urllib.parse.urljoin(base_url, rel_path)

        # The response has to stay open while thumbnails are produced,
        # since resizing reads pixel data from it; the context manager
        # also closes it when any step below raises.
        with urllib.request.urlopen(img_url) as f:
            img = Image.open(f)
            ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
            orig_width, orig_height = img.size

            # Needed widths: predefined and original, limited by
            # whichever is smaller.
            max_width = min(widths[-1], orig_width)
            img_widths = [
                w for w in sorted(set(widths + [orig_width]))
                if w <= max_width
            ]

            srcset = []
            largest_url = None
            for w in img_widths:
                fname = '%d.W%d.%s' % (i, w, ext)
                fpath = gallery_path + fname
                if not os.path.exists(fpath):
                    height = round(orig_height * w / orig_width)
                    img.resize((w, height)).save(fpath)
                th_url = gallery_url + fname
                srcset.append('%s %dw' % (th_url, w))
                # Widths are ascending, so the last one is the widest.
                largest_url = th_url

        ilustr.attrib['srcset'] = ", ".join(srcset)
        ilustr.attrib['src'] = largest_url
93
94
def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None,
              gallery_path='img/', gallery_url='img/', base_url='file://./'):
    """Transform the WL document to HTML.

    The work is done on a deep copy, so ``wldoc`` itself is not modified.

    Parameters:
        wldoc: the document to transform.
        stylesheet: key into ``STYLESHEETS`` selecting the XSLT to apply.
        options: extra keyword arguments passed to the document's
            ``transform`` call; ``None`` means no extra arguments.
        flags: iterable of attribute names set to ``'yes'`` on the
            document root before transforming.
        css: stylesheet URL handed to the XSLT; falls back to the
            Wolne Lektury book text stylesheet.
        gallery_path, gallery_url, base_url: passed to
            ``add_image_sizes`` for illustration handling.

    Returns an ``OutputFile`` holding the UTF-8 encoded HTML, or ``None``
    when the transformed document has no content.

    Raises ``ValueError`` when a ``KeyError`` escapes the processing
    (most notably for an unknown ``stylesheet`` name) and ``ParseError``
    on XML syntax or XSLT application errors.
    """
    try:
        # Parse XSLT
        style_filename = get_stylesheet(stylesheet)
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')
        document.fix_pa_akap()
        document.hebr_protect()

        if not options:
            options = {}

        try:
            os.makedirs(gallery_path)
        except OSError:
            pass

        add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)

        css = (
            css
            or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
        )
        # The value is passed to the stylesheet as a quoted string literal.
        css = "'%s'" % css
        result = document.transform(style, css=css, **options)
        del document  # no longer needed large object :)

        if html_has_content(result):
            add_anchors(result.getroot())
            add_table_of_themes(result.getroot())
            add_table_of_contents(result.getroot())

            return OutputFile.from_bytes(etree.tostring(
                result, method='html', xml_declaration=False,
                pretty_print=True, encoding='utf-8'
            ))
        else:
            return None
    except KeyError as e:
        # The message used to be raised with its '%s' placeholder unfilled.
        raise ValueError(
            "'%s' is not a valid stylesheet." % stylesheet
        ) from e
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e) from e
151     except (XMLSyntaxError, XSLTApplyError) as e:
152         raise ParseError(e)
153
154
class Fragment:
    """A theme fragment stored as a flat sequence of markup events.

    Every event is an ``(event, payload)`` pair. For ``'start'`` and
    ``'end'`` events the payload is an element (anything exposing
    ``tag``, ``attrib``, ``text`` and ``tail``); for any other event the
    payload is a string that is emitted verbatim.
    """

    def __init__(self, id, themes):
        super().__init__()
        self.events = []
        self.themes = themes
        self.id = id

    def append(self, event, element):
        """Record one more ``(event, element)`` pair."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events plus 'end' events for still-open elements.

        The extra events close the open elements innermost first. An
        'end' event arriving while nothing is open is reported on
        stdout and otherwise ignored.
        """
        pending = []
        for kind, node in self.events:
            if kind == 'start':
                pending.append(('end', node))
                continue
            if kind != 'end':
                continue
            if pending:
                pending.pop()
            else:
                print('CLOSED NON-OPEN TAG:', node)

        return self.events + pending[::-1]

    def to_string(self):
        """Serialize the balanced event sequence into a markup string."""
        chunks = []
        for kind, node in self.closed_events():
            if kind == 'start':
                attributes = ' '.join(
                    '%s="%s"' % pair for pair in node.attrib.items()
                )
                chunks.append('<%s %s>' % (node.tag, attributes))
                if node.text:
                    chunks.append(node.text)
            elif kind == 'end':
                chunks.append('</%s>' % node.tag)
                if node.tail:
                    chunks.append(node.tail)
            else:
                # Plain text events carry the string itself.
                chunks.append(node)

        return ''.join(chunks)

    def __str__(self):
        return self.to_string()
203
204
def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename.

    Elements whose class is 'theme-begin' / 'theme-end' delimit fragments
    and are matched through their 'fid' attribute. Everything seen while
    a fragment is open is recorded in it as events.

    Returns a ``(closed_fragments, open_fragments)`` pair of dicts keyed
    by fragment id: fragments that were properly ended, and fragments
    still open when the input ran out.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document, so the file is read with the
    # HTML parser first and one subtree of it (the first child of the
    # root's first child) is re-serialized into a buffer for iterparse.
    parser = etree.HTMLParser(encoding='utf-8')
    buf = io.BytesIO()
    buf.write(etree.tostring(
        etree.parse(input_filename, parser).getroot()[0][0],
        encoding='utf-8'
    ))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append parents: walk up until the element with
                # id="book-text", keeping a copy of each ancestor with its
                # leading text and id removed.
                parent = element.getparent()
                parents = []
                while parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    if 'id' in cparent.attrib:
                        del cparent.attrib['id']
                    parents.append(cparent)
                    parent = parent.getparent()

                # Outermost ancestor first, so the fragment opens them in
                # document order.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                # A fragment id that is already open keeps its first
                # fragment; the new one is dropped.
                if fragment.id not in open_fragments:
                    open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (
                        input_filename, element.get('fid')
                    ))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the marker's tail to every fragment that is open now
            # (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags: elements with a non-empty name
            # attribute or with class 'annotation' / 'anchor' are left
            # out, but the text following them is kept.
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append(
                            'text', element.tail
                        )
            else:
                # Each open fragment gets its own copy of the element,
                # stripped of its id, for both start and end events.
                for fragment_id in open_fragments:
                    celem = copy.copy(element)
                    if 'id' in celem.attrib:
                        del celem.attrib['id']
                    open_fragments[fragment_id].append(
                        event, celem
                    )

    return closed_fragments, open_fragments
283
284     return closed_fragments, open_fragments
285
286
def add_anchor(element, prefix, with_link=True, with_target=True,
               link_text=None):
    """Insert anchor ``a`` elements into the parent, just before ``element``.

    With ``with_link`` a visible link (class 'anchor') pointing at
    ``#prefix`` is added; its text is ``link_text``, or ``prefix`` when
    ``link_text`` is None. With ``with_target`` a named target
    (class 'target', ``name=prefix``) is added. When both are requested
    the target ends up in front of the link.
    """
    container = element.getparent()
    position = container.index(element)

    if with_link:
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = str(prefix if link_text is None else link_text)
        container.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = ' '
        # Same position as the link, which pushes the link one slot back.
        container.insert(position, target)
305
306
def any_ancestor(element, test):
    """Return True if ``test`` is truthy for any ancestor of ``element``.

    Ancestors are taken from ``element.iterancestors()`` and checking
    stops at the first match. The element itself is not tested.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
312
313
def add_anchors(root):
    """Add numbered anchors to verses and paragraphs below ``root``.

    Two numbers are tracked: a per-prefix counter used to build the
    anchor name (prefix followed by the counter, 'f' by default) and a
    visible counter used as the link text. An element with class
    'numeracja' restarts the visible counter from its ``data-start``
    attribute (1 when the attribute is missing or not a number) and, if
    it carries ``data-link``, switches to that prefix with its counter
    reset to 1.

    ``div`` elements whose class contains 'verse' are always counted but
    get an anchor only when the visible number is 1 or a multiple of 5;
    elements whose class contains 'paragraph' get an anchor every time.
    Elements nested inside notes, mottos, dedications, frames,
    blockquotes, the editorial note or the footnotes are skipped.
    """
    def in_unnumbered_block(e):
        return (
            e.get('class') in (
                'note', 'motto', 'motto_podpis', 'dedication', 'frame'
            )
            or e.get('id') == 'nota_red'
            or e.tag == 'blockquote'
            or e.get('id') == 'footnotes'
        )

    link_prefix = "f"
    counter = {"f": 1}
    visible_counter = 1
    for element in root.iterdescendants():
        if element.get('class') == 'numeracja':
            try:
                visible_counter = int(element.get('data-start'))
            except (TypeError, ValueError):
                # TypeError: attribute absent (None); ValueError: not a
                # number. Either way numbering restarts from 1.
                visible_counter = 1
            if element.get("data-link"):
                link_prefix = element.get("data-link")
                counter[link_prefix] = 1

        if any_ancestor(element, in_unnumbered_block):
            continue

        css_class = element.get('class', '')
        if element.tag == 'div' and 'verse' in css_class:
            if visible_counter == 1 or visible_counter % 5 == 0:
                add_anchor(
                    element,
                    "%s%d" % (link_prefix, counter[link_prefix]),
                    link_text=visible_counter
                )
            counter[link_prefix] += 1
            visible_counter += 1
        elif 'paragraph' in css_class:
            add_anchor(
                element,
                "%s%d" % (link_prefix, counter[link_prefix]),
                link_text=visible_counter
            )
            counter[link_prefix] += 1
            visible_counter += 1
348             counter[link_prefix] += 1
349             visible_counter += 1
350
351
352 def raw_printable_text(element):
353     working = copy.deepcopy(element)
354     for e in working.findall('a'):
355         if e.get('class') in ('annotation', 'theme-begin'):
356             e.text = ''
357     return etree.tostring(working, method='text', encoding='unicode').strip()
358
359
def add_table_of_contents(root):
    """Build a table of contents from headings and prepend it to ``root``.

    Every ``h2`` / ``h3`` outside the footnotes, the editorial note and
    person lists gets a named target ``sN`` and an entry in the table.
    An ``h3`` becomes a sub-entry when the latest top-level entry is an
    ``h2``; otherwise it is a top-level entry itself. The table is a
    ``div`` with id 'toc' inserted as the first child of ``root``.
    """
    def in_excluded_block(e):
        return (
            e.get('id') in ('footnotes', 'nota_red')
            or e.get('class') in ('person-list',)
        )

    sections = []
    counter = 1
    for heading in root.iterdescendants():
        if heading.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(heading, in_excluded_block):
            continue

        entry = (counter, heading.tag, raw_printable_text(heading), [])
        nests_under_last = (
            heading.tag == 'h3' and len(sections)
            and sections[-1][1] == 'h2'
        )
        if nests_under_last:
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(heading, "s%d" % counter, with_link=False)
        counter += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    toc_header = etree.SubElement(toc, 'h2')
    toc_header.text = 'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    # NOTE(review): add_anchor inserts the link in front of the element it
    # is given, so each link lands next to its <li>, not inside it --
    # confirm this is the intended markup.
    for number, _tag, title, children in sections:
        item = etree.SubElement(toc_list, 'li')
        add_anchor(item, "s%d" % number, with_target=False,
                   link_text=title)

        if not len(children):
            continue
        sub_list = etree.SubElement(item, 'ol')
        for child_number, _child_tag, child_title, _ in children:
            sub_item = etree.SubElement(sub_list, 'li')
            add_anchor(sub_item, "s%d" % child_number, with_target=False,
                       link_text=child_title)

    root.insert(0, toc)
402
403
def add_table_of_themes(root):
    """Build an index of themes and prepend it to ``root``.

    Theme names are read from the comma-separated text of every ``a``
    element with class 'theme-begin'; markers without text are ignored.
    The index is a ``div`` with id 'themes' holding one list item per
    theme, followed by numbered links to the ``name`` of each marker
    that mentions it. Themes are ordered with ``sortify`` when that
    package can be imported and by plain string comparison otherwise.
    """
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    targets_by_theme = {}
    for marker in root.findall('.//a[@class="theme-begin"]'):
        if not marker.text:
            continue
        target = marker.get('name')
        for raw_name in marker.text.split(','):
            targets_by_theme.setdefault(raw_name.strip(), []).append(target)

    ordered_themes = sorted(
        targets_by_theme.items(), key=lambda pair: sortify(pair[0])
    )

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, targets in ordered_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for number, target in enumerate(targets, start=1):
            link = etree.SubElement(themes_li, 'a', href="#%s" % target)
            link.text = str(number)
            link.tail = ' '
    root.insert(0, themes_div)
430
431
def extract_annotations(html_path):
    """Extracts annotations from HTML for annotations dictionary.

    For each annotation, yields a tuple of:
    anchor, footnote type, valid qualifiers, text, html.

    Annotations are the ``div`` children of the element with id
    'footnotes'; nothing is yielded when there is no such element.
    Qualifiers are read from a parenthesized list placed before the
    first em dash of the text, and only those present in
    ``FN_QUALIFIERS`` are kept.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        fn_type = footnote.get('class').split('-')[1]
        anchor = footnote.find('a[@class="annotation"]').get('href')[1:]

        # Drop the two leading children and the leading text, so that
        # only the annotation body is serialized.
        del footnote[:2]
        footnote.text = None
        if len(footnote) and footnote[-1].tail == '\n':
            footnote[-1].tail = None
        text_str = etree.tostring(footnote, method='text',
                                  encoding='unicode').strip()
        html_str = etree.tostring(footnote, method='html',
                                  encoding='unicode').strip()

        qualifiers = []
        match = re_qualifier.match(text_str)
        if match:
            for candidate in re.split('[;,]', match.group(1)):
                candidate = candidate.strip()
                if candidate in FN_QUALIFIERS:
                    qualifiers.append(candidate)
                elif candidate.startswith('z '):
                    # "z <qualifier>": look up the word after "z".
                    subcandidate = candidate.split()[1]
                    if subcandidate in FN_QUALIFIERS:
                        qualifiers.append(subcandidate)

        yield anchor, fn_type, qualifiers, text_str, html_str