Fixes rotated image
[librarian.git] / src / librarian / html.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 import io
5 import os
6 import re
7 import copy
8 import urllib.parse
9 import urllib.request
10
11 from lxml import etree
12 from librarian import XHTMLNS, DCNS, ParseError, OutputFile
13 from librarian import functions
14 from PIL import Image
15
16 from lxml.etree import XMLSyntaxError, XSLTApplyError
17
18
# Import-time side effect: both calls run as soon as this module is imported.
# NOTE(review): presumably they register the XSLT/XPath extension functions
# that the stylesheets rely on -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Maps a stylesheet name (as accepted by get_stylesheet() and transform())
# to the stylesheet's path relative to this module's directory.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
}
25
26
def get_stylesheet(name):
    """Return the filesystem path of the stylesheet registered as *name*.

    The relative path stored in STYLESHEETS is resolved against the
    directory that contains this module.

    Raises KeyError if *name* is not a key of STYLESHEETS.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
29
30
def html_has_content(text):
    """Evaluate an XPath looking for <p> or <h1> elements in *text*.

    Both un-namespaced and XHTML-namespaced elements are matched.  The
    XPath result is returned as-is; callers use its truthiness to decide
    whether the document has any content.
    """
    namespace = str(XHTMLNS)
    expression = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': namespace}
    find_content = etree.ETXPath(expression)
    return find_content(text)
35
36
def transform_abstrakt(abstrakt_element):
    """Render an ``abstrakt`` element to an HTML string.

    The element is serialized, its tag is textually renamed to
    ``dlugi_cytat`` and the result is run through the 'legacy' stylesheet.
    Section anchors (``<a name="secN"/>``) and any ``blockquote`` tags are
    then stripped from the output.
    """
    stylesheet = etree.parse(get_stylesheet('legacy'))

    source = etree.tostring(abstrakt_element, encoding='unicode')
    # HACK: plain text substitution of the opening and closing tag names.
    source = source.replace('<abstrakt', '<dlugi_cytat')
    source = source.replace('</abstrakt', '</dlugi_cytat')
    document = etree.parse(io.StringIO(source))

    rendered = etree.tostring(document.xslt(stylesheet), encoding='unicode')
    without_anchors = re.sub('<a name="sec[0-9]*"/>', '', rendered)
    return re.sub('</?blockquote[^>]*>', '', without_anchors)
48
49
def add_image_sizes(tree, gallery_path, gallery_url, base_url):
    """Generate resized copies of every ``ilustr`` image and set ``srcset``.

    For each ``ilustr`` element found in *tree* the image referenced by its
    ``src`` attribute (resolved against *base_url*) is downloaded and saved
    in several widths.  The element's ``srcset`` attribute is set to the
    list of generated variants and ``src`` is pointed at the widest one.

    Args:
        tree: parsed document searched with ``findall('//ilustr')``.
        gallery_path: prefix for output files; it is concatenated directly
            with the file name, so it should end with a path separator.
        gallery_url: prefix for the URLs written into the document; also
            concatenated directly with the file name.
        base_url: base URL used to resolve relative ``src`` values.

    Files that already exist under *gallery_path* are not regenerated.
    """
    widths = [360, 600, 1200, 1800, 2400]
    try:
        os.makedirs(gallery_path)
    except OSError:
        # Best effort: typically the directory already exists.  A genuinely
        # unusable directory surfaces later, when a thumbnail is saved.
        pass

    for i, ilustr in enumerate(tree.findall('//ilustr')):
        rel_path = ilustr.attrib['src']
        img_url = urllib.parse.urljoin(base_url, rel_path)

        # The image is decoded lazily, so the stream has to stay open until
        # all sizes are written; ``with`` also closes it on errors.
        with urllib.request.urlopen(img_url) as f:
            img = Image.open(f)
            ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
            orig_width, orig_height = img.size

            # Needed widths: predefined and original, limited by
            # whichever is smaller.  This never upscales the image and the
            # list is never empty (it always contains either the original
            # width or the predefined widths up to the maximum).
            max_width = min(widths[-1], orig_width)
            img_widths = [
                w for w in sorted(set(widths + [orig_width]))
                if w <= max_width
            ]

            srcset = []
            for w in img_widths:
                fname = '%d.W%d.%s' % (i, w, ext)
                fpath = gallery_path + fname
                if not os.path.exists(fpath):
                    height = round(orig_height * w / orig_width)
                    th = img.resize((w, height))
                    th.save(fpath)
                th_url = gallery_url + fname
                srcset.append('%s %dw' % (th_url, w))
                # Widths are ascending, so the last one is the largest.
                largest_url = th_url

        ilustr.attrib['srcset'] = ", ".join(srcset)
        ilustr.attrib['src'] = largest_url
93
94
def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/', base_url='file://./'):
    """Transform the WL document to XHTML.

    Args:
        wldoc: source document; it is deep-copied, the caller's object is
            not modified.
        stylesheet: name of the stylesheet, a key of STYLESHEETS.
        options: optional dict of extra parameters passed to the XSLT
            transform.
        flags: optional iterable of attribute names set to ``'yes'`` on the
            root element before transforming.
        css: URL of the stylesheet linked from the output; a default
            Wolne Lektury URL is used when empty.
        gallery_path: output prefix for generated image variants.
        gallery_url: URL prefix for generated image variants.
        base_url: base URL used to resolve image sources.

    Returns:
        An OutputFile with the serialized HTML, or None when the result
        has no content (see html_has_content).

    Raises:
        ValueError: when a KeyError occurs during processing, which is
            reported as an unknown *stylesheet* name.
        ParseError: when XML parsing or applying the XSLT fails.
    """
    # Parse XSLT
    try:
        style_filename = get_stylesheet(stylesheet)
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        ltag = document.edoc.find('//' + DCNS('language'))
        if ltag is not None:
            lang = functions.lang_code_3to2(ltag.text)
        else:
            lang = 'pl'
        document.edoc.getroot().set('lang', lang)

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')
        document.fix_pa_akap()
        document.hebr_protect()

        if not options:
            options = {}

        try:
            os.makedirs(gallery_path)
        except OSError:
            pass

        add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)

        css = (
            css
            or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
        )
        # The value is passed as an XSLT parameter, hence the quoting.
        css = "'%s'" % css
        result = document.transform(style, css=css, **options)
        del document  # no longer needed large object :)

        if html_has_content(result):
            add_anchors(result.getroot())
            add_table_of_themes(result.getroot())
            add_table_of_contents(result.getroot())

            return OutputFile.from_bytes(etree.tostring(
                result, method='html', xml_declaration=False,
                pretty_print=True, encoding='utf-8'
            ))
        else:
            return None
    except KeyError:
        # The message used to be raised with its '%s' placeholder unfilled.
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e) from e
160
161
class Fragment:
    """A theme fragment recorded as a flat list of parse events.

    Each event is a ``(kind, payload)`` pair: for ``'start'`` and ``'end'``
    the payload is an element, for any other kind it is a string that is
    emitted verbatim by to_string().
    """

    def __init__(self, id, themes):
        super().__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by 'end' events for unclosed tags.

        An 'end' event seen while nothing is open is reported on stdout
        and otherwise ignored.
        """
        pending = []
        for kind, node in self.events:
            if kind == 'start':
                pending.append(('end', node))
            elif kind == 'end':
                if pending:
                    pending.pop()
                else:
                    print('CLOSED NON-OPEN TAG:', node)

        # Innermost open element has to be closed first.
        return self.events + pending[::-1]

    def to_string(self):
        """Serialize the (closed) event list to a markup string."""
        chunks = []
        for kind, item in self.closed_events():
            if kind == 'start':
                attributes = ' '.join(
                    '%s="%s"' % (key, value)
                    for key, value in item.attrib.items()
                )
                chunks.append('<%s %s>' % (item.tag, attributes))
                if item.text:
                    chunks.append(item.text)
            elif kind == 'end':
                chunks.append('</%s>' % item.tag)
                if item.tail:
                    chunks.append(item.tail)
            else:
                # Plain text event: the payload is already a string.
                chunks.append(item)

        return ''.join(chunks)

    def __str__(self):
        return self.to_string()
210
211
def extract_fragments(input_filename):
    """Extract theme fragments from input_filename.

    The HTML file is parsed, the first child of the first child of its
    root is re-serialized and walked with ``iterparse``.  Elements with
    class ``theme-begin`` / ``theme-end`` (matched by their ``fid``
    attribute) delimit fragments; everything seen while a fragment is
    open is recorded in it as events.

    Returns:
        A ``(closed_fragments, open_fragments)`` pair of dicts mapping
        fragment id to Fragment; ``open_fragments`` holds the fragments
        that were started but never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document
    parser = etree.HTMLParser(encoding='utf-8')
    buf = io.BytesIO()
    buf.write(etree.tostring(
        etree.parse(input_filename, parser).getroot()[0][0],
        encoding='utf-8'
    ))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append parents, so the fragment starts inside copies of
                # all elements that enclose the marker.  The walk stops at
                # the 'book-text' element, or at the top of the tree when
                # the marker is not inside one (previously that case
                # crashed on ``None.get``).
                parent = element.getparent()
                parents = []
                while (parent is not None
                        and parent.get('id', None) != 'book-text'):
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    if 'id' in cparent.attrib:
                        del cparent.attrib['id']
                    parents.append(cparent)
                    parent = parent.getparent()

                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                # A marker repeating an already open id is ignored.
                if fragment.id not in open_fragments:
                    open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (
                        input_filename, element.get('fid')
                    ))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the marker's tail to every fragment still open
            # (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags, keeping only the text that follows them
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append(
                            'text', element.tail
                        )
            else:
                for fragment_id in open_fragments:
                    celem = copy.copy(element)
                    if 'id' in celem.attrib:
                        del celem.attrib['id']
                    open_fragments[fragment_id].append(
                        event, celem
                    )

    return closed_fragments, open_fragments
292
293
def add_anchor(element, prefix, with_link=True, with_target=True,
               link_text=None):
    """Insert anchor ``<a>`` elements directly before *element*.

    Both anchors are added to *element*'s parent, at *element*'s index:

    * with_link: ``<a class="anchor" href="#prefix">`` whose text is
      *link_text* (or *prefix* when *link_text* is None);
    * with_target: ``<a class="target" name="prefix">`` containing a
      single space.

    When both are requested, the target ends up before the link.
    """
    container = element.getparent()
    position = container.index(element)

    if with_link:
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = str(prefix if link_text is None else link_text)
        container.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = ' '
        # Same index again: this pushes the link (if any) one place right.
        container.insert(position, target)
312
313
def any_ancestor(element, test):
    """Return True if *test* is truthy for any ancestor of *element*.

    Ancestors come from ``element.iterancestors()``; evaluation stops at
    the first match.  Returns False when there is no match.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
319
320
def add_anchors(root):
    """Number verses and paragraphs under *root* and insert their anchors.

    Walks all descendants of *root*.  Elements located inside notes,
    mottos, dedications, frames, blockquotes, the editorial note or the
    footnotes are skipped.  Every ``div`` whose class contains ``verse``
    and every element whose class contains ``paragraph`` is counted; an
    anchor (see add_anchor) is inserted before each paragraph, and before
    verses whose visible number is 1 or a multiple of 5.

    An element with class ``numeracja`` restarts the visible numbering at
    its ``data-start`` value (1 when missing or not a number) and, if it
    has a ``data-link`` attribute, switches to that link prefix with its
    own counter starting at 1.
    """
    link_prefix = "f"
    counter = {"f": 1}
    visible_counter = 1

    def is_excluded_container(e):
        return (
            e.get('class') in (
                'note', 'motto', 'motto_podpis', 'dedication', 'frame'
            )
            or e.get('id') == 'nota_red'
            or e.tag == 'blockquote'
            or e.get('id') == 'footnotes'
        )

    for element in root.iterdescendants():
        if element.get('class') == 'numeracja':
            try:
                visible_counter = int(element.get('data-start'))
            except (TypeError, ValueError):
                # TypeError: the attribute is missing altogether.
                visible_counter = 1
            if element.get("data-link"):
                link_prefix = element.get("data-link")
                counter[link_prefix] = 1

        if any_ancestor(element, is_excluded_container):
            continue

        # Comment and processing-instruction nodes return None from get()
        # even when a default is supplied, so normalize to a string.
        css_class = element.get('class') or ''
        is_verse = element.tag == 'div' and 'verse' in css_class
        if not is_verse and 'paragraph' not in css_class:
            continue

        if not is_verse or visible_counter == 1 or visible_counter % 5 == 0:
            add_anchor(
                element,
                "%s%d" % (link_prefix, counter[link_prefix]),
                link_text=visible_counter
            )
        counter[link_prefix] += 1
        visible_counter += 1
357
358
def raw_printable_text(element):
    """Return the stripped text content of *element*.

    Works on a deep copy, so *element* itself is left untouched.  The text
    of direct ``a`` children with class ``annotation`` or ``theme-begin``
    is blanked before serializing.
    """
    hidden_classes = ('annotation', 'theme-begin')
    clone = copy.deepcopy(element)
    for anchor in clone.findall('a'):
        if anchor.get('class') in hidden_classes:
            anchor.text = ''
    text = etree.tostring(clone, method='text', encoding='unicode')
    return text.strip()
365
366
def add_table_of_contents(root):
    """Build a ``div#toc`` from h2/h3 headings and insert it first in *root*.

    Headings inside the footnotes, the editorial note or a person list are
    ignored.  Every other heading gets a target anchor ``sN`` inserted
    before it.  An h3 is listed as a subsection when the most recent
    top-level entry is an h2; otherwise it becomes a top-level entry.

    NOTE(review): the links are created with add_anchor(), which inserts
    into the parent of the element it is given -- i.e. each link becomes a
    sibling placed before its ``li`` rather than a child of it.
    """
    def in_excluded_part(e):
        return (
            e.get('id') in ('footnotes', 'nota_red')
            or e.get('class') in ('person-list',)
        )

    sections = []
    counter = 1
    for heading in root.iterdescendants():
        if heading.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(heading, in_excluded_part):
            continue

        entry = (counter, heading.tag, raw_printable_text(heading), [])
        if (heading.tag == 'h3' and len(sections)
                and sections[-1][1] == 'h2'):
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(heading, "s%d" % counter, with_link=False)
        counter += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    toc_header = etree.SubElement(toc, 'h2')
    toc_header.text = 'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for number, _tag, title, subsections in sections:
        section_item = etree.SubElement(toc_list, 'li')
        add_anchor(section_item, "s%d" % number, with_target=False,
                   link_text=title)

        if not subsections:
            continue
        subsection_list = etree.SubElement(section_item, 'ol')
        for sub_number, _sub_tag, sub_title, _ in subsections:
            subsection_item = etree.SubElement(subsection_list, 'li')
            add_anchor(subsection_item, "s%d" % sub_number,
                       with_target=False, link_text=sub_title)

    root.insert(0, toc)
409
410
def add_table_of_themes(root):
    """Build a ``div#themes`` index of themes and insert it first in *root*.

    Every ``a.theme-begin`` element with non-empty text contributes its
    ``name`` attribute to each comma-separated theme listed in that text.
    The themes are sorted by name (through ``sortify`` when that package
    can be imported) and each gets an ``li`` with numbered links to its
    occurrences.
    """
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    occurrences = {}
    for marker in root.findall('.//a[@class="theme-begin"]'):
        if not marker.text:
            continue
        for raw_name in marker.text.split(','):
            occurrences.setdefault(raw_name.strip(), []).append(
                marker.get('name')
            )

    ordered = sorted(occurrences.items(), key=lambda pair: sortify(pair[0]))

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, targets in ordered:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for number, target in enumerate(targets, start=1):
            item = etree.SubElement(themes_li, 'a', href="#%s" % target)
            item.text = str(number)
            item.tail = ' '
    root.insert(0, themes_div)
437
438
def extract_annotations(html_path):
    """Extract annotations from HTML for the annotations dictionary.

    Looks at the ``div`` children of the element with id ``footnotes``
    and, for each of them, yields a tuple of:
    anchor, footnote type, valid qualifiers, text, html.

    The footnote type is the second dash-separated part of the div's
    class.  Qualifiers are read from a parenthesized list that precedes
    an em dash in the footnote text; only names present in FN_QUALIFIERS
    are kept.  Nothing is yielded when there is no footnotes element.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    tree = etree.parse(html_path, etree.HTMLParser(encoding='utf-8'))
    footnotes = tree.find('//*[@id="footnotes"]')
    re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        fn_type = footnote.get('class').split('-')[1]
        anchor = footnote.find('a[@class="annotation"]').get('href')[1:]

        # Drop the first two children and the leading text, so that only
        # the remaining content is serialized below.
        del footnote[:2]
        footnote.text = None
        if len(footnote) and footnote[-1].tail == '\n':
            footnote[-1].tail = None

        text_str = etree.tostring(
            footnote, method='text', encoding='unicode'
        ).strip()
        html_str = etree.tostring(
            footnote, method='html', encoding='unicode'
        ).strip()

        qualifiers = []
        found = re_qualifier.match(text_str)
        if found:
            for raw in re.split('[;,]', found.group(1)):
                name = raw.strip()
                if name in FN_QUALIFIERS:
                    qualifiers.append(name)
                elif name.startswith('z '):
                    # "z <qualifier>": check the word that follows "z".
                    following = name.split()[1]
                    if following in FN_QUALIFIERS:
                        qualifiers.append(following)

        yield anchor, fn_type, qualifiers, text_str, html_str