Epub: fixes #4164, #4166
[librarian.git] / src / librarian / html.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import print_function, unicode_literals
7
8 import os
9 import re
10 import copy
11
12 from lxml import etree
13 from librarian import XHTMLNS, ParseError, OutputFile
14 from librarian import functions
15 from PIL import Image
16
17 from lxml.etree import XMLSyntaxError, XSLTApplyError
18 import six
19
20
# Run Librarian's registration hooks once, at import time.
# NOTE(review): presumably these make helper functions available to the
# XSLT stylesheets listed below -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Maps a stylesheet name (the key accepted by get_stylesheet()) to the
# XSLT file path, relative to the directory containing this module.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
29
30
def get_stylesheet(name):
    """Return the filesystem path of the stylesheet registered as ``name``.

    The path is built from this module's directory and the relative path
    stored in STYLESHEETS.  Raises KeyError when ``name`` is not registered.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
33
34
def html_has_content(text):
    """Return the ``p`` and ``h1`` elements found anywhere in ``text``.

    Both un-namespaced and XHTML-namespaced elements are matched.  The
    result is the raw XPath node list, so it is falsy when nothing matches.
    """
    expression = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
    find_content = etree.ETXPath(expression)
    return find_content(text)
39
40
def transform_abstrakt(abstrakt_element):
    """Render an ``abstrakt`` element to an HTML string.

    The element is serialized, run through the 'legacy' stylesheet and the
    resulting markup is returned with section anchors and blockquote tags
    stripped out.
    """
    stylesheet = etree.parse(get_stylesheet('legacy'))
    source = etree.tostring(abstrakt_element, encoding='unicode')
    # HACK: rename the tag textually so that the stylesheet's template for
    # 'dlugi_cytat' is applied to the abstract.
    renamed = source.replace('abstrakt', 'dlugi_cytat')
    document = etree.parse(six.StringIO(renamed))
    rendered = etree.tostring(document.xslt(stylesheet), encoding='unicode')
    without_anchors = re.sub('<a name="sec[0-9]*"/>', '', rendered)
    return re.sub('</?blockquote[^>]*>', '', without_anchors)
52
53
def add_image_sizes(tree, gallery_path, gallery_url, base_url):
    """Generate resized copies of every illustration and set ``srcset``.

    For each ``ilustr`` element found in ``tree``, the image referenced by
    its ``src`` attribute (resolved against ``base_url``) is fetched, resized
    to each predefined width and saved as ``<index>.W<width>.<ext>``.  The
    element's ``srcset`` attribute is then set to the generated variants and
    its ``src`` is pointed at the widest one.

    ``gallery_path`` and ``gallery_url`` are used as plain string prefixes
    of the file name, so both are expected to end with a separator.
    """
    widths = [360, 600, 1200, 1800, 2400]

    for i, ilustr in enumerate(tree.findall('//ilustr')):
        rel_path = ilustr.attrib['src']
        img_url = six.moves.urllib.parse.urljoin(base_url, rel_path)

        with six.moves.urllib.request.urlopen(img_url) as f:
            img = Image.open(f)
            # Image.open() only reads the header.  Load the pixel data
            # while the stream is still open; otherwise resize() below
            # would try to read from an already closed file.
            img.load()

        ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
        orig_width, orig_height = img.size

        srcset = []
        # NOTE(review): every predefined width is generated, even when it
        # is larger than the source image (i.e. the image is upscaled).
        # Limiting the list to min(widths[-1], orig_width) may have been
        # intended -- confirm before changing, as it alters the set of
        # generated file names.
        for w in widths:
            # int() keeps the size integral where round() returns a float.
            height = int(round(orig_height * w / orig_width))
            th = img.resize((w, height))
            fname = '%d.W%d.%s' % (i, w, ext)
            th.save(gallery_path + fname)
            th_url = gallery_url + fname
            srcset.append(" ".join((
                th_url,
                '%dw' % w
            )))
            # Widths are ascending, so the last one assigned is the widest.
            largest_url = th_url
        ilustr.attrib['srcset'] = ", ".join(srcset)
        ilustr.attrib['src'] = largest_url
90
91
def transform(wldoc, stylesheet='legacy', options=None, flags=None,
              css=None, gallery_path='img/', gallery_url='img/',
              base_url='file://./'):
    """Transforms the WL document to XHTML.

    Works on a deep copy of ``wldoc``; the caller's document is not modified.

    Parameters:
        wldoc: the WL document to transform.
        stylesheet: a key of STYLESHEETS selecting the XSLT file.
        options: optional dict passed to the document's ``transform()`` as
            extra keyword arguments.
        flags: optional iterable of attribute names; each is set to 'yes'
            on the document's root element before the transformation.
        css: stylesheet URL handed to the XSLT; defaults to the
            wolnelektury.pl book text CSS.
        gallery_path, gallery_url, base_url: forwarded to
            add_image_sizes(); ``gallery_path`` is created if missing.

    Returns an OutputFile with the serialized HTML, or None if the
    transformed document has no content.

    Raises:
        ValueError: when a KeyError occurs during processing, which is
            reported as an invalid ``stylesheet`` name.
        ParseError: on XMLSyntaxError or XSLTApplyError.
    """
    # Parse XSLT
    try:
        style_filename = get_stylesheet(stylesheet)
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')

        if not options:
            options = {}

        # Best effort: the directory may already exist.
        try:
            os.makedirs(gallery_path)
        except OSError:
            pass

        add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)

        css = (
            css
            or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
        )
        # The value is wrapped in quotes before being handed over --
        # presumably so the XSLT receives it as a string literal.
        css = "'%s'" % css
        result = document.transform(style, css=css, **options)
        del document  # no longer needed large object :)

        if html_has_content(result):
            add_anchors(result.getroot())
            add_table_of_themes(result.getroot())
            add_table_of_contents(result.getroot())

            return OutputFile.from_bytes(etree.tostring(
                result, method='html', xml_declaration=False,
                pretty_print=True, encoding='utf-8'
            ))
        else:
            return None
    except KeyError:
        # The placeholder used to be left uninterpolated, so the message
        # never named the offending stylesheet.
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
148
149
@six.python_2_unicode_compatible
class Fragment(object):
    """A theme fragment recorded as a flat stream of markup events.

    Every recorded event is a ``(kind, payload)`` pair.  For ``'start'`` and
    ``'end'`` the payload is an element; for any other kind the payload is
    emitted verbatim by to_string().
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record a single ``(event, element)`` pair."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by closers for still-open elements.

        An 'end' event arriving while nothing is open is reported on
        standard output and otherwise ignored for balancing purposes.
        """
        pending = []
        for kind, node in self.events:
            if kind == 'start':
                pending.append(('end', node))
            elif kind == 'end':
                if pending:
                    pending.pop()
                else:
                    print('CLOSED NON-OPEN TAG:', node)

        # Innermost element is closed first.
        return self.events + pending[::-1]

    def to_string(self):
        """Serialize the balanced event stream to a markup string."""
        chunks = []
        for kind, node in self.closed_events():
            if kind == 'start':
                attributes = ' '.join(
                    '%s="%s"' % pair for pair in node.attrib.items()
                )
                chunks.append(u'<%s %s>' % (node.tag, attributes))
                if node.text:
                    chunks.append(node.text)
            elif kind == 'end':
                chunks.append(u'</%s>' % node.tag)
                if node.tail:
                    chunks.append(node.tail)
            else:
                # Any other event carries ready-made text.
                chunks.append(node)

        return ''.join(chunks)

    def __str__(self):
        return self.to_string()
199
200
def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename.

    The HTML file is parsed and the first child of its first top-level
    element is re-serialized and walked with iterparse.  Elements with class
    'theme-begin' open a Fragment keyed by their ``fid`` attribute; elements
    with class 'theme-end' close the fragment with the matching ``fid``.
    While a fragment is open, the events of the elements in between are
    recorded in it.

    Returns a ``(closed_fragments, open_fragments)`` pair of dicts keyed by
    fragment id; ``open_fragments`` holds fragments that were never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document
    parser = etree.HTMLParser(encoding='utf-8')
    buf = six.BytesIO()
    buf.write(etree.tostring(
        etree.parse(input_filename, parser).getroot()[0][0],
        encoding='utf-8'
    ))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append parents: stripped copies of the ancestors up to
                # (but excluding) the #book-text container.  The None check
                # stops at the tree root when the marker is not inside
                # #book-text instead of failing with AttributeError.
                parent = element.getparent()
                parents = []
                while (parent is not None
                        and parent.get('id', None) != 'book-text'):
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    if 'id' in cparent.attrib:
                        del cparent.attrib['id']
                    parents.append(cparent)
                    parent = parent.getparent()

                # Outermost ancestor has to be opened first.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (
                        input_filename, element.get('fid')
                    ))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the marker's tail to every fragment that is open now
            # (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags: elements with a non-empty 'name'
            # attribute or class 'annotation'/'anchor' are dropped, but
            # their tail text is kept.
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append(
                            'text', element.tail
                        )
            else:
                for fragment_id in open_fragments:
                    # Each fragment gets its own copy, without the id.
                    celem = copy.copy(element)
                    if 'id' in celem.attrib:
                        del celem.attrib['id']
                    open_fragments[fragment_id].append(
                        event, celem
                    )

    return closed_fragments, open_fragments
280
281
def add_anchor(element, prefix, with_link=True, with_target=True,
               link_text=None):
    """Insert anchor ``a`` elements into ``element``'s parent, before it.

    With ``with_link`` a visible link to ``#<prefix>`` (class 'anchor') is
    added, labelled with ``link_text`` or, if that is None, with ``prefix``.
    With ``with_target`` a named target (class 'target') is added.  When both
    are requested the target ends up in front of the link.
    """
    container = element.getparent()
    position = container.index(element)

    if with_link:
        label = prefix if link_text is None else link_text
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = six.text_type(label)
        container.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = u' '
        # Same index as the link, so the target lands before it.
        container.insert(position, target)
300
301
def any_ancestor(element, test):
    """Return True if ``test`` is truthy for any ancestor of ``element``.

    Ancestors are taken from ``element.iterancestors()`` and checked in the
    order given; checking stops at the first match.
    """
    return any(test(node) for node in element.iterancestors())
307
308
def add_anchors(root):
    """Number verses and paragraphs below ``root`` and add anchors to them.

    Elements nested in notes, mottos, dedications, frames, blockquotes or
    the element with id 'nota_red' are skipped and not counted.  Every
    paragraph gets an anchor; verses are all counted, but only the first
    one and every fifth one get an anchor.
    """
    skipped_classes = (
        'note', 'motto', 'motto_podpis', 'dedication', 'frame'
    )

    def is_excluded_container(node):
        return (
            node.get('class') in skipped_classes
            or node.get('id') == 'nota_red'
            or node.tag == 'blockquote'
        )

    number = 1
    for node in root.iterdescendants():
        if any_ancestor(node, is_excluded_container):
            continue

        css_class = node.get('class', '')
        if node.tag == 'div' and 'verse' in css_class:
            if number == 1 or number % 5 == 0:
                add_anchor(node, "f%d" % number, link_text=number)
            number += 1
        elif 'paragraph' in css_class:
            add_anchor(node, "f%d" % number, link_text=number)
            number += 1
330
331
def raw_printable_text(element):
    """Return the stripped text content of ``element``.

    The text of direct ``a`` children with class 'annotation' or
    'theme-begin' is left out.  ``element`` itself is not modified; the
    work is done on a deep copy.
    """
    hidden_classes = ('annotation', 'theme-begin')
    clone = copy.deepcopy(element)
    for link in clone.findall('a'):
        if link.get('class') in hidden_classes:
            link.text = ''
    text = etree.tostring(clone, method='text', encoding='unicode')
    return text.strip()
338
339
def add_table_of_contents(root):
    """Build a table of contents and insert it as first child of ``root``.

    Every ``h2``/``h3`` below ``root`` gets a numbered target anchor
    ("s<N>"), except headings located inside the elements with id
    'footnotes' or 'nota_red' or with class 'person-list'.  An ``h3`` that
    follows an ``h2`` entry is listed as its subsection.  The table itself
    is a ``div`` with id 'toc' holding a header and nested ordered lists.
    """
    def in_excluded_part(node):
        return (
            node.get('id') in ('footnotes', 'nota_red')
            or node.get('class') in ('person-list',)
        )

    # Each entry: (number, tag, title, list of sub-entries).
    entries = []
    number = 1
    for heading in root.iterdescendants():
        if heading.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(heading, in_excluded_part):
            continue

        title = raw_printable_text(heading)
        entry = (number, heading.tag, title, [])
        is_subsection = (
            heading.tag == 'h3' and len(entries)
            and entries[-1][1] == 'h2'
        )
        if is_subsection:
            entries[-1][3].append(entry)
        else:
            entries.append(entry)
        add_anchor(heading, "s%d" % number, with_link=False)
        number += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    etree.SubElement(toc, 'h2').text = u'Spis treści'
    top_list = etree.SubElement(toc, 'ol')

    for entry_number, _tag, title, children in entries:
        item = etree.SubElement(top_list, 'li')
        add_anchor(item, "s%d" % entry_number, with_target=False,
                   link_text=title)

        if children:
            nested_list = etree.SubElement(item, 'ol')
            for child_number, _child_tag, child_title, _ in children:
                child_item = etree.SubElement(nested_list, 'li')
                add_anchor(child_item, "s%d" % child_number,
                           with_target=False, link_text=child_title)

    root.insert(0, toc)
382
383
def add_table_of_themes(root):
    """Build an index of themes and insert it as first child of ``root``.

    Theme names are read from the comma-separated text of every ``a``
    element with class 'theme-begin'.  The index is a ``div`` with id
    'themes' listing each theme followed by numbered links to the markers
    (by their ``name`` attribute) in which it occurs.  Themes are ordered
    with ``sortify`` when that module can be imported, and by their plain
    names otherwise.
    """
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    anchors_by_theme = {}
    for marker in root.findall('.//a[@class="theme-begin"]'):
        if not marker.text:
            continue
        names = [part.strip() for part in marker.text.split(',')]
        for name in names:
            if name not in anchors_by_theme:
                anchors_by_theme[name] = []
            anchors_by_theme[name].append(marker.get('name'))

    ordered_themes = sorted(
        anchors_by_theme.items(),
        key=lambda pair: sortify(pair[0])
    )

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for name, anchors in ordered_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % name
        for position, anchor in enumerate(anchors, 1):
            link = etree.SubElement(themes_li, 'a', href="#%s" % anchor)
            link.text = str(position)
            link.tail = ' '
    root.insert(0, themes_div)
410
411
def extract_annotations(html_path):
    """Extracts annotations from HTML for annotations dictionary.

    Looks up the element with id 'footnotes' in the file at ``html_path``
    and walks its direct ``div`` children.  Yields nothing if there is no
    such element.

    For each annotation, yields a tuple of:
    anchor, footnote type, valid qualifiers, text, html.

    The anchor is the annotation link's ``href`` without its first
    character; the footnote type is the second dash-separated part of the
    div's class.  Qualifiers are taken from a parenthesized list preceding
    an em dash at the start of the text, keeping only entries known to
    FN_QUALIFIERS.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    html_parser = etree.HTMLParser(encoding='utf-8')
    document = etree.parse(html_path, html_parser)
    footnotes = document.find('//*[@id="footnotes"]')
    qualifier_pattern = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        fn_type = footnote.get('class').split('-')[1]
        anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
        # Drop the first two child elements and the leading text so that
        # only the remaining content is serialized.
        del footnote[:2]
        footnote.text = None
        if len(footnote) and footnote[-1].tail == '\n':
            footnote[-1].tail = None
        text_str = etree.tostring(footnote, method='text',
                                  encoding='unicode').strip()
        html_str = etree.tostring(footnote, method='html',
                                  encoding='unicode').strip()

        qualifiers = []
        found = qualifier_pattern.match(text_str)
        if found:
            for part in re.split('[;,]', found.group(1)):
                part = part.strip()
                if part in FN_QUALIFIERS:
                    qualifiers.append(part)
                elif part.startswith('z '):
                    # "z <qualifier>": check the word following "z".
                    inner = part.split()[1]
                    if inner in FN_QUALIFIERS:
                        qualifiers.append(inner)

        yield anchor, fn_type, qualifiers, text_str, html_str