fixes for edge cases
[librarian.git] / src / librarian / html.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import print_function, unicode_literals
7
8 import os
9 import re
10 import copy
11
12 from lxml import etree
13 from librarian import XHTMLNS, ParseError, OutputFile
14 from librarian import functions
15 from PIL import Image
16
17 from lxml.etree import XMLSyntaxError, XSLTApplyError
18 import six
19
20
# Import-time side effect: both calls run once when this module is loaded.
# NOTE(review): presumably they register helper functions used by the XSLT
# stylesheets below -- confirm against librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Maps a stylesheet name to its XSLT file; get_stylesheet() joins the
# relative path with this module's directory.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
29
30
def get_stylesheet(name):
    """Return the path of the XSLT file registered under ``name``.

    The path is built from this module's directory and the relative path
    stored in STYLESHEETS. Raises KeyError when ``name`` is not a key of
    STYLESHEETS.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
33
34
def html_has_content(text):
    """Return the <p> and <h1> elements found anywhere in ``text``.

    Both the un-namespaced and the XHTML-namespaced variants of the tags
    are matched. Callers use the result for its truthiness: an empty
    result means the document has no content.
    """
    namespace = str(XHTMLNS)
    expression = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': namespace}
    find_content = etree.ETXPath(expression)
    return find_content(text)
39
40
def transform_abstrakt(abstrakt_element):
    """Render an ``abstrakt`` element to an HTML string.

    The element is serialized, processed with the 'legacy' stylesheet and
    the result is returned as text with the section anchors and the
    enclosing blockquote tags stripped.
    """
    style = etree.parse(get_stylesheet('legacy'))
    source = etree.tostring(abstrakt_element, encoding='unicode')
    # HACK: rename every occurrence of 'abstrakt' to 'dlugi_cytat' in the
    # serialized XML before it is re-parsed and transformed.
    renamed = source.replace('abstrakt', 'dlugi_cytat')
    document = etree.parse(six.StringIO(renamed))
    rendered = etree.tostring(document.xslt(style), encoding='unicode')
    without_anchors = re.sub('<a name="sec[0-9]*"/>', '', rendered)
    return re.sub('</?blockquote[^>]*>', '', without_anchors)
52
53
def add_image_sizes(tree, gallery_path, gallery_url, base_url):
    """Generate resized copies of every illustration and add ``srcset``.

    For each ``ilustr`` element found in ``tree`` the image referenced by
    its ``src`` attribute (resolved against ``base_url``) is opened and
    scaled to each needed width. The element then gets a ``srcset``
    attribute listing all generated files, and its ``src`` is replaced
    with the URL of the widest one.

    Needed widths are the predefined ones plus the original width, limited
    to whichever of the original width and the largest predefined width is
    smaller, so images are never scaled up.

    :param tree: document tree searched with ``findall('//ilustr')``;
        modified in place.
    :param gallery_path: prefix the file name is appended to when saving;
        it is concatenated as-is, so it must end with a path separator.
    :param gallery_url: prefix the file name is appended to when building
        the URLs written to ``src``/``srcset``.
    :param base_url: base against which relative ``src`` values are
        resolved.
    :raises KeyError: if an ``ilustr`` element has no ``src`` attribute.
    """
    widths = [360, 600, 1200, 1800, 2400]

    for i, ilustr in enumerate(tree.findall('//ilustr')):
        rel_path = ilustr.attrib['src']
        img_url = six.moves.urllib.parse.urljoin(base_url, rel_path)

        f = six.moves.urllib.request.urlopen(img_url)
        # The handle has to stay open while the thumbnails are produced,
        # and must be released even if opening or resizing fails.
        try:
            img = Image.open(f)
            ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
            orig_width, orig_height = img.size

            # Needed widths: predefined and original, limited by
            # whichever is smaller.
            max_width = min(widths[-1], orig_width)
            img_widths = sorted(
                w for w in set(widths + [orig_width]) if w <= max_width
            )

            srcset = []
            largest_url = None
            for w in img_widths:
                fname = '%d.W%d.%s' % (i, w, ext)
                fpath = gallery_path + fname
                if not os.path.exists(fpath):
                    # Explicit float maths and int(): the module does not
                    # enable true division on Python 2, and resize()
                    # needs integer dimensions of at least one pixel.
                    height = max(
                        1, int(round(float(orig_height) * w / orig_width))
                    )
                    th = img.resize((w, height))
                    th.save(fpath)
                th_url = gallery_url + fname
                srcset.append('%s %dw' % (th_url, w))
                # Widths are ascending, so the last one is the largest.
                largest_url = th_url
        finally:
            f.close()

        ilustr.attrib['srcset'] = ", ".join(srcset)
        ilustr.attrib['src'] = largest_url
93
94
def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None,
              gallery_path='img/', gallery_url='img/', base_url='file://./'):
    """Transform the WL document to XHTML.

    The document is deep-copied first, so the object passed in is not
    modified. Illustrations are resized into ``gallery_path`` (created if
    missing) by add_image_sizes() before the stylesheet is applied.

    :param wldoc: the document to transform.
    :param stylesheet: a key of STYLESHEETS.
    :param options: extra keyword parameters for the XSLT transformation.
    :param flags: names of attributes set to 'yes' on the root element
        before transforming.
    :param css: stylesheet URL passed to the XSLT; a default one is used
        when empty.
    :param gallery_path: filesystem prefix for generated images.
    :param gallery_url: URL prefix for generated images.
    :param base_url: base for resolving illustration sources.
    :returns: an OutputFile with the serialized HTML, or None when the
        result contains no content (see html_has_content()).
    :raises ValueError: if ``stylesheet`` is not a known stylesheet name.
    :raises ParseError: on XML syntax or XSLT application errors.
    """
    # Resolve the stylesheet on its own, so that only an unknown name is
    # reported as an invalid stylesheet; a KeyError raised further down
    # has a different cause and is left to propagate unchanged.
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)

    try:
        # Parse XSLT
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')
        document.fix_pa_akap()

        if not options:
            options = {}

        # Best effort: the directory usually exists already.
        try:
            os.makedirs(gallery_path)
        except OSError:
            pass

        add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)

        css = (
            css
            or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
        )
        # Quoted so that the value reaches the XSLT as a string literal.
        css = "'%s'" % css
        result = document.transform(style, css=css, **options)
        del document  # no longer needed large object :)

        if not html_has_content(result):
            return None

        add_anchors(result.getroot())
        add_table_of_themes(result.getroot())
        add_table_of_contents(result.getroot())

        return OutputFile.from_bytes(etree.tostring(
            result, method='html', xml_declaration=False,
            pretty_print=True, encoding='utf-8'
        ))
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
152
153
@six.python_2_unicode_compatible
class Fragment(object):
    """A theme fragment recorded as a flat list of parse events.

    Each event is an ``(event, element)`` pair. ``event`` is 'start' or
    'end' with a tree element, or any other value (extract_fragments()
    uses 'text') with a plain string.
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one more ``(event, element)`` pair."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by 'end' events for unclosed tags.

        An 'end' event with no matching 'start' is reported on stdout.
        """
        pending = []
        for kind, node in self.events:
            if kind == 'start':
                pending.append(('end', node))
            elif kind == 'end':
                if pending:
                    pending.pop()
                else:
                    print('CLOSED NON-OPEN TAG:', node)

        # Innermost tags are closed first.
        return self.events + pending[::-1]

    def to_string(self):
        """Serialize the closed event list to a markup string."""
        chunks = []
        for kind, node in self.closed_events():
            if kind == 'start':
                attributes = ' '.join(
                    '%s="%s"' % (name, value)
                    for name, value in node.attrib.items()
                )
                chunks.append(u'<%s %s>' % (node.tag, attributes))
                if node.text:
                    chunks.append(node.text)
            elif kind == 'end':
                chunks.append(u'</%s>' % node.tag)
                if node.tail:
                    chunks.append(node.tail)
            else:
                # Anything else carries a plain string.
                chunks.append(node)

        return ''.join(chunks)

    def __str__(self):
        return self.to_string()
203
204
def extract_fragments(input_filename):
    """Extract theme fragments from the HTML file input_filename.

    Elements with class 'theme-begin' open a fragment identified by their
    'fid' attribute and elements with class 'theme-end' close it; events
    seen in between are recorded in every fragment open at that moment.

    Returns a tuple (closed_fragments, open_fragments) of dicts mapping
    fragment id to Fragment; the second one holds the fragments that were
    never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document, so the file is parsed with
    # the HTML parser and one subtree is re-serialized into a buffer.
    parser = etree.HTMLParser(encoding='utf-8')
    buf = six.BytesIO()
    buf.write(etree.tostring(
        # First child of the root's first child.
        # NOTE(review): presumably the 'book-text' element -- confirm.
        etree.parse(input_filename, parser).getroot()[0][0],
        encoding='utf-8'
    ))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append copies of the enclosing elements (up to, but not
                # including, the one with id 'book-text'), without their
                # leading text and ids.
                parent = element.getparent()
                parents = []
                while parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    if 'id' in cparent.attrib:
                        del cparent.attrib['id']
                    parents.append(cparent)
                    parent = parent.getparent()

                # Collected innermost-first; emit outermost-first.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                # A repeated fid does not replace an already open fragment.
                if fragment.id not in open_fragments:
                    open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (
                        input_filename, element.get('fid')
                    ))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the marker's tail to every fragment that is still
            # open (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit elements that have a 'name' attribute or the class
            # 'annotation'/'anchor'; only their tail text is kept.
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append(
                            'text', element.tail
                        )
            else:
                # Record the event with a copy of the element stripped of
                # its id in every open fragment.
                for fragment_id in open_fragments:
                    celem = copy.copy(element)
                    if 'id' in celem.attrib:
                        del celem.attrib['id']
                    open_fragments[fragment_id].append(
                        event, celem
                    )

    return closed_fragments, open_fragments
285
286
def add_anchor(element, prefix, with_link=True, with_target=True,
               link_text=None):
    """Insert anchor elements into ``element``'s parent, before it.

    With ``with_link`` an ``<a class="anchor" href="#prefix">`` is added,
    labelled with ``link_text`` (``prefix`` when no text is given). With
    ``with_target`` an ``<a class="target" name="prefix">`` is added. Both
    are inserted at the element's original position, so the target ends up
    in front of the link.
    """
    container = element.getparent()
    position = container.index(element)

    if with_link:
        label = prefix if link_text is None else link_text
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = six.text_type(label)
        container.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = u' '
        container.insert(position, target)
305
306
def any_ancestor(element, test):
    """Return True if ``test`` is truthy for any ancestor of ``element``.

    Ancestors are taken from ``element.iterancestors()`` and checking
    stops at the first match.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
312
313
def add_anchors(root):
    """Number verses and paragraphs below ``root`` and add their anchors.

    Two counters are kept: ``counter`` gives every verse and paragraph a
    unique ``f<N>`` anchor name, ``visible_counter`` is the number shown
    to the reader. An element with class 'numeracja' resets the visible
    number to its ``data-start`` attribute (1 when that is missing or not
    an integer). Paragraphs always get an anchor; verses only when their
    visible number is 1 or a multiple of 5. Elements inside notes, mottos,
    dedications, frames, blockquotes, the editorial note and the footnotes
    are neither counted nor anchored.
    """
    excluded_classes = (
        'note', 'motto', 'motto_podpis', 'dedication', 'frame'
    )
    excluded_ids = ('nota_red', 'footnotes')

    def is_excluded_container(e):
        return (
            e.get('class') in excluded_classes
            or e.get('id') in excluded_ids
            or e.tag == 'blockquote'
        )

    counter = 1
    visible_counter = 1
    for element in root.iterdescendants():
        if element.get('class') == 'numeracja':
            try:
                visible_counter = int(element.get('data-start'))
            except (TypeError, ValueError):
                # TypeError: the attribute is missing, so get() gave None.
                visible_counter = 1

        if any_ancestor(element, is_excluded_container):
            continue

        # Normalise a missing class to '' so that the substring checks
        # below never operate on None.
        css_class = element.get('class') or ''

        if element.tag == 'div' and 'verse' in css_class:
            if visible_counter == 1 or visible_counter % 5 == 0:
                add_anchor(element, "f%d" % counter, link_text=visible_counter)
            counter += 1
            visible_counter += 1
        elif 'paragraph' in css_class:
            add_anchor(element, "f%d" % counter, link_text=visible_counter)
            counter += 1
            visible_counter += 1
346
347
def raw_printable_text(element):
    """Return the stripped text content of ``element``.

    The text of direct ``<a>`` children with class 'annotation' or
    'theme-begin' is left out. The work is done on a deep copy, so
    ``element`` itself is not modified.
    """
    hidden_classes = ('annotation', 'theme-begin')
    clone = copy.deepcopy(element)
    for link in clone.findall('a'):
        if link.get('class') in hidden_classes:
            link.text = ''
    text = etree.tostring(clone, method='text', encoding='unicode')
    return text.strip()
354
355
def add_table_of_contents(root):
    """Build a table of contents from h2/h3 headings and prepend it.

    Every ``h2`` and ``h3`` below ``root`` (except those inside the
    footnotes, the editorial note or a person list) gets an ``s<N>``
    target anchor in front of it. An ``h3`` that follows an ``h2`` entry
    is listed as its subsection. The resulting ``<div id="toc">`` is
    inserted as the first child of ``root``.
    """
    def in_excluded_section(e):
        return (
            e.get('id') in ('footnotes', 'nota_red')
            or e.get('class') in ('person-list',)
        )

    def add_entry(parent_list, number, text):
        # The link goes inside the <li>. add_anchor() is not used here
        # because it inserts next to the element, which would leave the
        # <a> directly inside the <ol> beside an empty list item.
        item = etree.SubElement(parent_list, 'li')
        link = etree.SubElement(item, 'a', href='#s%d' % number)
        link.set('class', 'anchor')
        link.text = six.text_type(text)
        return item

    sections = []
    counter = 1
    for element in root.iterdescendants():
        if element.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(element, in_excluded_section):
            continue

        entry = (counter, element.tag, raw_printable_text(element), [])
        if element.tag == 'h3' and sections and sections[-1][1] == 'h2':
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(element, "s%d" % counter, with_link=False)
        counter += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    toc_header = etree.SubElement(toc, 'h2')
    toc_header.text = u'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for n, _tag, text, subsections in sections:
        section_element = add_entry(toc_list, n, text)

        if subsections:
            subsection_list = etree.SubElement(section_element, 'ol')
            for n1, _subtag, subtext, _ in subsections:
                add_entry(subsection_list, n1, subtext)

    root.insert(0, toc)
398
399
def add_table_of_themes(root):
    """Prepend a ``<div id="themes">`` index of themes to ``root``.

    Theme names come from the comma-separated text of every
    ``<a class="theme-begin">`` element. Each theme becomes a list item
    with numbered links to the ``name`` of each marker mentioning it.
    Themes are ordered with ``sortify`` when that module can be imported,
    and by their plain names otherwise.
    """
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    anchors_by_theme = {}
    for marker in root.findall('.//a[@class="theme-begin"]'):
        if not marker.text:
            continue
        for raw_name in marker.text.split(','):
            anchors = anchors_by_theme.setdefault(raw_name.strip(), [])
            anchors.append(marker.get('name'))

    ordered_themes = sorted(
        anchors_by_theme.items(), key=lambda pair: sortify(pair[0])
    )

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, anchors in ordered_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for number, anchor in enumerate(anchors, 1):
            link = etree.SubElement(themes_li, 'a', href="#%s" % anchor)
            link.text = str(number)
            link.tail = ' '
    root.insert(0, themes_div)
426
427
def extract_annotations(html_path):
    """Yield the annotations found in the HTML file at ``html_path``.

    For each ``div`` inside the element with id 'footnotes' a tuple is
    produced:
    anchor, footnote type, valid qualifiers, text, html.

    The footnote type is the second dash-separated part of the div's
    class. Qualifiers are read from a parenthesised list that precedes an
    em dash at the start of the text and are kept only when present in
    FN_QUALIFIERS.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    document = etree.parse(html_path, etree.HTMLParser(encoding='utf-8'))
    footnotes = document.find('//*[@id="footnotes"]')
    if footnotes is None:
        return

    qualifier_pattern = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
    for footnote in footnotes.findall('div'):
        fn_type = footnote.get('class').split('-')[1]
        anchor = footnote.find('a[@class="annotation"]').get('href')[1:]

        # Drop the first two children and the leading text, so that only
        # the rest of the footnote gets serialized.
        del footnote[:2]
        footnote.text = None
        if len(footnote) and footnote[-1].tail == '\n':
            footnote[-1].tail = None

        text_str = etree.tostring(
            footnote, method='text', encoding='unicode'
        ).strip()
        html_str = etree.tostring(
            footnote, method='html', encoding='unicode'
        ).strip()

        qualifiers = []
        found = qualifier_pattern.match(text_str)
        if found:
            for raw_candidate in re.split('[;,]', found.group(1)):
                name = raw_candidate.strip()
                if name in FN_QUALIFIERS:
                    qualifiers.append(name)
                elif name.startswith('z '):
                    # "z <qualifier>": check the word after the 'z'.
                    inner = name.split()[1]
                    if inner in FN_QUALIFIERS:
                        qualifiers.append(inner)

        yield anchor, fn_type, qualifiers, text_str, html_str