Add Hebrew support in pdfs.
[librarian.git] / src / librarian / html.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 import io
5 import os
6 import re
7 import copy
8 import urllib.parse
9 import urllib.request
10
11 from lxml import etree
12 from librarian import XHTMLNS, DCNS, ParseError, OutputFile
13 from librarian import functions
14 from PIL import Image
15
16 from lxml.etree import XMLSyntaxError, XSLTApplyError
17
18
# Run the registration hooks from librarian.functions once, at import time.
# NOTE(review): presumably these expose helper functions to the XSLT
# stylesheet -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Known stylesheets: name -> XSLT path relative to this module's directory
# (resolved by get_stylesheet).
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
}
25
26
def get_stylesheet(name):
    """Return the filesystem path of the stylesheet registered as ``name``.

    The path is built from this module's directory and the relative path
    stored in STYLESHEETS. Raises KeyError for an unregistered name.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
29
30
def html_has_content(text):
    """Return the ``p`` and ``h1`` elements found in ``text``.

    Both un-namespaced and XHTML-namespaced elements are matched. The
    result is the XPath node list, so it is falsy when nothing matches.
    """
    query = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
    finder = etree.ETXPath(query)
    return finder(text)
35
36
def transform_abstrakt(abstrakt_element):
    """Render an ``abstrakt`` element to an HTML string.

    The element is serialized, run through the 'legacy' stylesheet, and
    the section anchors and blockquote tags are stripped from the output.
    """
    stylesheet = etree.parse(get_stylesheet('legacy'))

    source = etree.tostring(abstrakt_element, encoding='unicode')
    # HACK: rename the element to <dlugi_cytat> before applying the
    # stylesheet; the resulting blockquote wrapper is removed below.
    source = source.replace('<abstrakt', '<dlugi_cytat')
    source = source.replace('</abstrakt', '</dlugi_cytat')

    rendered = etree.parse(io.StringIO(source)).xslt(stylesheet)
    markup = etree.tostring(rendered, encoding='unicode')
    markup = re.sub('<a name="sec[0-9]*"/>', '', markup)
    return re.sub('</?blockquote[^>]*>', '', markup)
48
49
def add_image_sizes(tree, gallery_path, gallery_url, base_url):
    """Create scaled copies of every illustration and reference them in tree.

    For each ``ilustr`` element the image is fetched from ``base_url``
    joined with the element's ``src``. Scaled copies are saved under
    ``gallery_path`` (existing files are reused). The element then gets a
    ``srcset`` attribute listing the copies, and ``src`` is replaced with
    the URL of the widest one.

    Args:
        tree: document tree searched with ``findall('//ilustr')``.
        gallery_path: directory the scaled images are written to; created
            if missing.
        gallery_url: URL prefix prepended to each generated file name.
        base_url: base URL the ``src`` attributes are resolved against.
    """
    widths = [360, 600, 1200, 1800, 2400]
    # Only "already exists" is tolerated; other failures surface here
    # instead of later, when the first thumbnail is saved.
    os.makedirs(gallery_path, exist_ok=True)

    for i, ilustr in enumerate(tree.findall('//ilustr')):
        rel_path = ilustr.attrib['src']
        img_url = urllib.parse.urljoin(base_url, rel_path)

        # The context manager closes the handle even if decoding,
        # resizing or saving fails.
        with urllib.request.urlopen(img_url) as f:
            img = Image.open(f)
            ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')

            # Needed widths: predefined and original, limited by
            # whichever is smaller. The original width is always part of
            # the candidates, so the list is never empty unless the image
            # is wider than the largest predefined width (then all
            # predefined widths are kept).
            max_width = min(widths[-1], img.size[0])
            img_widths = [
                w for w in sorted(set(widths + [img.size[0]]))
                if w <= max_width
            ]

            srcset = []
            # Iterate the limited list so images are never upscaled.
            for w in img_widths:
                fname = '%d.W%d.%s' % (i, w, ext)
                fpath = os.path.join(gallery_path, fname)
                if not os.path.exists(fpath):
                    height = round(img.size[1] * w / img.size[0])
                    th = img.resize((w, height))
                    th.save(fpath)
                th_url = gallery_url + fname
                srcset.append('%s %dw' % (th_url, w))
                # Widths are ascending, so the last URL is the widest copy.
                largest_url = th_url

        ilustr.attrib['srcset'] = ", ".join(srcset)
        ilustr.attrib['src'] = largest_url
93
94
def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None,
              gallery_path='img/', gallery_url='img/', base_url='file://./'):
    """Transform the WL document to XHTML.

    The work is done on a deep copy of ``wldoc``. Illustrations are
    resized into ``gallery_path`` before the stylesheet is applied, and
    anchors, a table of themes and a table of contents are added to the
    result.

    Args:
        wldoc: the source document; it is copied, not modified.
        stylesheet: key into STYLESHEETS selecting the XSLT to apply.
        options: extra keyword parameters passed to the transformation.
        flags: names set as attributes with value 'yes' on the root element.
        css: stylesheet URL handed to the XSLT; defaults to the
            wolnelektury.pl book text CSS.
        gallery_path: directory for the generated illustration files.
        gallery_url: URL prefix used for the generated illustrations.
        base_url: base URL against which illustration sources are resolved.

    Returns:
        An OutputFile with the UTF-8 encoded HTML, or None when the
        transformed document has no content.

    Raises:
        ValueError: when a KeyError occurs, which is what an unknown
            ``stylesheet`` name produces.
        ParseError: on XMLSyntaxError or XSLTApplyError.
    """
    try:
        # Parse XSLT
        style_filename = get_stylesheet(stylesheet)
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        # Expose the document language on the root element, defaulting
        # to Polish when the code cannot be mapped.
        ltag = document.edoc.find('//' + DCNS('language'))
        lang = functions.lang_code_3to2(ltag.text) or 'pl'
        document.edoc.getroot().set('lang', lang)

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')
        document.fix_pa_akap()
        document.hebr_protect()

        if not options:
            options = {}

        try:
            os.makedirs(gallery_path)
        except OSError:
            pass

        add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)

        css = (
            css
            or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
        )
        # The value is wrapped in single quotes before being passed on as
        # an XSLT parameter.
        css = "'%s'" % css
        result = document.transform(style, css=css, **options)
        del document  # no longer needed large object :)

        if html_has_content(result):
            add_anchors(result.getroot())
            add_table_of_themes(result.getroot())
            add_table_of_contents(result.getroot())

            return OutputFile.from_bytes(etree.tostring(
                result, method='html', xml_declaration=False,
                pretty_print=True, encoding='utf-8'
            ))
        else:
            return None
    except KeyError:
        # NOTE(review): any KeyError raised inside the try block ends up
        # here, not only an unknown stylesheet name.
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
157
158
class Fragment:
    """A recorded sequence of parse events belonging to one fragment.

    Events are ``(kind, payload)`` pairs: for 'start' and 'end' the payload
    is an element, for any other kind it is a piece of text.
    """

    def __init__(self, id, themes):
        super().__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by 'end' events for unclosed elements.

        An 'end' event with no matching 'start' is reported on stdout and
        otherwise left in place.
        """
        pending = []
        for kind, node in self.events:
            if kind == 'start':
                pending.append(('end', node))
            elif kind == 'end':
                if pending:
                    pending.pop()
                else:
                    print('CLOSED NON-OPEN TAG:', node)

        # Innermost element is closed first.
        return self.events + pending[::-1]

    def to_string(self):
        """Serialize the (balanced) events into a markup string."""
        chunks = []
        for kind, node in self.closed_events():
            if kind == 'start':
                attributes = ' '.join(
                    '%s="%s"' % pair for pair in node.attrib.items()
                )
                chunks.append('<%s %s>' % (node.tag, attributes))
                if node.text:
                    chunks.append(node.text)
            elif kind == 'end':
                chunks.append('</%s>' % node.tag)
                if node.tail:
                    chunks.append(node.tail)
            else:
                # Plain text event: the payload is the text itself.
                chunks.append(node)

        return ''.join(chunks)

    def __str__(self):
        return self.to_string()
207
208
def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename.

    Elements with class 'theme-begin' open a fragment and elements with
    class 'theme-end' close it; both are matched by their 'fid' attribute.
    Everything parsed while a fragment is open is recorded in it.

    Returns a tuple ``(closed_fragments, open_fragments)`` of dicts keyed
    by fragment id; ``open_fragments`` holds fragments never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document
    # so the file is parsed with HTMLParser, one subtree is re-serialized
    # into a buffer, and iterparse runs on that buffer instead.
    # NOTE(review): getroot()[0][0] is the first child of the root's first
    # child -- presumably the element holding the book text; confirm.
    parser = etree.HTMLParser(encoding='utf-8')
    buf = io.BytesIO()
    buf.write(etree.tostring(
        etree.parse(input_filename, parser).getroot()[0][0],
        encoding='utf-8'
    ))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append parents
                # Walk up to the element with id 'book-text', copying each
                # ancestor without its text and id, so the fragment starts
                # inside the same chain of enclosing elements.
                parent = element.getparent()
                parents = []
                while parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    if 'id' in cparent.attrib:
                        del cparent.attrib['id']
                    parents.append(cparent)
                    parent = parent.getparent()

                # Outermost ancestor first.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                # A repeated fid does not replace a fragment already open.
                if fragment.id not in open_fragments:
                    open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (
                        input_filename, element.get('fid')
                    ))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Pass the marker's tail text on to every fragment that is
            # open at this point (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags
            # (elements with a non-empty 'name' attribute, or with class
            # 'annotation' or 'anchor'); only their tail text is kept.
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append(
                            'text', element.tail
                        )
            else:
                # Record the event, with a copy of the element stripped of
                # its id, in every open fragment.
                for fragment_id in open_fragments:
                    celem = copy.copy(element)
                    if 'id' in celem.attrib:
                        del celem.attrib['id']
                    open_fragments[fragment_id].append(
                        event, celem
                    )

    return closed_fragments, open_fragments
289
290
def add_anchor(element, prefix, with_link=True, with_target=True,
               link_text=None):
    """Insert anchor elements into ``element``'s parent, before ``element``.

    With ``with_link`` an ``<a class="anchor" href="#prefix">`` is added,
    labelled with ``link_text`` (or ``prefix`` when that is None). With
    ``with_target`` an ``<a class="target" name="prefix">`` is added. When
    both are requested the target ends up in front of the link.
    """
    container = element.getparent()
    position = container.index(element)

    if with_link:
        label = prefix if link_text is None else link_text
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = str(label)
        container.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = ' '
        # Same index as the link, so the target lands before it.
        container.insert(position, target)
309
310
def any_ancestor(element, test):
    """Return True if ``test`` is truthy for any ancestor of ``element``.

    Ancestors are visited in the order given by ``iterancestors()`` and
    the search stops at the first match.
    """
    return any(test(candidate) for candidate in element.iterancestors())
316
317
def add_anchors(root):
    """Add numbered anchors to verses and paragraphs below ``root``.

    Two counters are kept: a per-prefix counter used in the anchor name
    and a visible counter used as the link text. An element with class
    'numeracja' resets the visible counter from its ``data-start``
    attribute and, when it has ``data-link``, switches to that link prefix
    and restarts its counter. Elements inside notes, mottos, dedications,
    frames, blockquotes, the editorial note or the footnotes are skipped.

    Verses (``div`` with 'verse' in its class) get an anchor only when the
    visible number is 1 or a multiple of 5; elements with 'paragraph' in
    their class always get one.
    """
    link_prefix = "f"
    counter = {"f": 1}
    visible_counter = 1

    # Defined once instead of on every loop iteration.
    def is_unnumbered_container(e):
        return (
            e.get('class') in (
                'note', 'motto', 'motto_podpis', 'dedication', 'frame'
            )
            or e.get('id') == 'nota_red'
            or e.tag == 'blockquote'
            or e.get('id') == 'footnotes'
        )

    for element in root.iterdescendants():
        if element.get('class') == 'numeracja':
            try:
                visible_counter = int(element.get('data-start'))
            except (TypeError, ValueError):
                # TypeError: attribute missing (get() returned None);
                # ValueError: attribute present but not a number.
                visible_counter = 1
            if element.get("data-link"):
                link_prefix = element.get("data-link")
                counter[link_prefix] = 1

        if any_ancestor(element, is_unnumbered_container):
            continue

        if element.tag == 'div' and 'verse' in element.get('class', ''):
            if visible_counter == 1 or visible_counter % 5 == 0:
                add_anchor(
                    element,
                    "%s%d" % (link_prefix, counter[link_prefix]),
                    link_text=visible_counter
                )
            counter[link_prefix] += 1
            visible_counter += 1
        elif 'paragraph' in element.get('class', ''):
            add_anchor(
                element,
                "%s%d" % (link_prefix, counter[link_prefix]),
                link_text=visible_counter
            )
            counter[link_prefix] += 1
            visible_counter += 1
354
355
def raw_printable_text(element):
    """Return the stripped text content of ``element``.

    The text of direct ``a`` children with class 'annotation' or
    'theme-begin' is left out. The element itself is not modified; the
    work is done on a deep copy.
    """
    hidden_classes = ('annotation', 'theme-begin')
    clone = copy.deepcopy(element)
    for link in clone.findall('a'):
        if link.get('class') in hidden_classes:
            link.text = ''
    text = etree.tostring(clone, method='text', encoding='unicode')
    return text.strip()
362
363
def add_table_of_contents(root):
    """Build a table of contents from h2/h3 headings and prepend it to root.

    Headings inside the footnotes, the editorial note or a 'person-list'
    element are ignored. Every other heading gets a target anchor named
    ``s<N>``. An h3 that follows an h2 entry is listed as its subsection;
    any other heading becomes a top-level entry. The resulting
    ``<div id="toc">`` is inserted as the first child of ``root``.
    """
    def in_excluded_region(ancestor):
        return (
            ancestor.get('id') in ('footnotes', 'nota_red')
            or ancestor.get('class') in ('person-list',)
        )

    # Each entry: (number, tag, title, list of sub-entries).
    entries = []
    number = 1
    for heading in root.iterdescendants():
        if heading.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(heading, in_excluded_region):
            continue

        entry = (number, heading.tag, raw_printable_text(heading), [])
        is_subsection = (
            heading.tag == 'h3' and entries and entries[-1][1] == 'h2'
        )
        if is_subsection:
            entries[-1][3].append(entry)
        else:
            entries.append(entry)
        add_anchor(heading, "s%d" % number, with_link=False)
        number += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    etree.SubElement(toc, 'h2').text = 'Spis treści'
    top_list = etree.SubElement(toc, 'ol')

    # NOTE(review): add_anchor inserts next to the element it is given, so
    # each link is placed before its <li> rather than inside it -- confirm
    # this is intended.
    for entry_number, _tag, title, children in entries:
        item = etree.SubElement(top_list, 'li')
        add_anchor(item, "s%d" % entry_number, with_target=False,
                   link_text=title)

        if children:
            nested_list = etree.SubElement(item, 'ol')
            for child_number, _child_tag, child_title, _ in children:
                child_item = etree.SubElement(nested_list, 'li')
                add_anchor(child_item, "s%d" % child_number,
                           with_target=False, link_text=child_title)

    root.insert(0, toc)
406
407
def add_table_of_themes(root):
    """Build an index of themes and prepend it to ``root``.

    Every ``<a class="theme-begin">`` with text contributes its ``name``
    attribute to each comma-separated theme it lists. The themes are
    sorted (using ``sortify`` when that package is importable) and
    rendered as ``<div id="themes">`` holding one list item per theme,
    with numbered links to the fragments.
    """
    try:
        from sortify import sortify
    except ImportError:
        # Fallback: sort by the unmodified theme name.
        def sortify(x):
            return x

    anchors_by_theme = {}
    for marker in root.findall('.//a[@class="theme-begin"]'):
        if not marker.text:
            continue
        for raw_name in marker.text.split(','):
            anchors_by_theme.setdefault(raw_name.strip(), []).append(
                marker.get('name')
            )

    ordered_themes = sorted(
        anchors_by_theme.items(), key=lambda pair: sortify(pair[0])
    )

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, anchor_names in ordered_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for position, anchor_name in enumerate(anchor_names, start=1):
            link = etree.SubElement(themes_li, 'a', href="#%s" % anchor_name)
            link.text = str(position)
            link.tail = ' '
    root.insert(0, themes_div)
434
435
def extract_annotations(html_path):
    """Extracts annotations from HTML for annotations dictionary.

    Reads the ``div`` children of the element with id 'footnotes'.
    For each annotation, yields a tuple of:
    anchor, footnote type, valid qualifiers, text, html.

    Yields nothing when the document has no footnotes element.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    parser = etree.HTMLParser(encoding='utf-8')
    footnotes = etree.parse(html_path, parser).find('//*[@id="footnotes"]')
    qualifier_pattern = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        # The type is the second dash-separated part of the class name.
        fn_type = footnote.get('class').split('-')[1]
        anchor = footnote.find('a[@class="annotation"]').get('href')[1:]

        # Drop the first two children and the leading text so only the
        # annotation body is serialized.
        del footnote[:2]
        footnote.text = None
        if len(footnote) and footnote[-1].tail == '\n':
            footnote[-1].tail = None

        text_str = etree.tostring(
            footnote, method='text', encoding='unicode'
        ).strip()
        html_str = etree.tostring(
            footnote, method='html', encoding='unicode'
        ).strip()

        qualifiers = []
        found = qualifier_pattern.match(text_str)
        if found:
            for raw in re.split('[;,]', found.group(1)):
                name = raw.strip()
                if name in FN_QUALIFIERS:
                    qualifiers.append(name)
                elif name.startswith('z '):
                    # "z <qualifier>": check the word after the "z".
                    inner = name.split()[1]
                    if inner in FN_QUALIFIERS:
                        qualifiers.append(inner)

        yield anchor, fn_type, qualifiers, text_str, html_str