d262198dedc7ec9335a905c15c27b473f2a0f1e0
[librarian.git] / src / librarian / html.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import print_function, unicode_literals
7
8 import os
9 import re
10 import copy
11
12 from lxml import etree
13 from librarian import XHTMLNS, ParseError, OutputFile
14 from librarian import functions
15 from PIL import Image
16
17 from lxml.etree import XMLSyntaxError, XSLTApplyError
18 import six
19
20
# Run Librarian's registration helpers once, at import time.
# NOTE(review): presumably these expose extension functions used by the XSLT
# stylesheets listed below -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Maps the stylesheet names accepted by get_stylesheet() to XSLT file paths,
# relative to the directory containing this module.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
29
30
def get_stylesheet(name):
    """Return the path of the XSLT file registered under ``name``.

    The path is built from this module's directory and the matching
    STYLESHEETS entry. Raises KeyError when ``name`` is not a key of
    STYLESHEETS.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
33
34
def html_has_content(text):
    """Evaluate an XPath over ``text`` selecting its ``p`` and ``h1`` elements.

    Both un-namespaced elements and elements in the XHTMLNS namespace are
    selected. The XPath result is returned as-is; transform() uses it as a
    truth value to decide whether the document has any content.
    """
    namespace = str(XHTMLNS)
    query = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': namespace}
    find_content = etree.ETXPath(query)
    return find_content(text)
39
40
def transform_abstrakt(abstrakt_element):
    """Render ``abstrakt_element`` to an HTML string with the legacy XSLT.

    The element is serialized, re-parsed as a stand-alone document and run
    through the 'legacy' stylesheet. Section anchors and blockquote tags
    are then stripped from the serialized result.
    """
    stylesheet = etree.parse(get_stylesheet('legacy'))

    source = etree.tostring(abstrakt_element, encoding='unicode')
    # HACK: every occurrence of 'abstrakt' in the serialized XML (not only
    # the tag name) is rewritten to 'dlugi_cytat' before re-parsing.
    source = source.replace('abstrakt', 'dlugi_cytat')
    document = etree.parse(six.StringIO(source))

    rendered = etree.tostring(document.xslt(stylesheet), encoding='unicode')
    without_anchors = re.sub('<a name="sec[0-9]*"/>', '', rendered)
    return re.sub('</?blockquote[^>]*>', '', without_anchors)
52
53
def add_image_sizes(tree, gallery_path, gallery_url):
    """Generate scaled copies of every ``ilustr`` image and set ``srcset``.

    For each ``ilustr`` element found in ``tree``, the image at
    ``gallery_path + src`` is opened and a downscaled copy is saved next to
    it for every predefined width smaller than the original. The copy is
    named by inserting ``.W<width>`` before the file extension. The
    element's ``srcset`` attribute then lists the copies followed by the
    original, and ``src`` is rewritten to ``gallery_url + src``.

    ``gallery_path`` and ``gallery_url`` are used as plain string prefixes
    (no path separator is added).

    Raises KeyError if an ``ilustr`` element has no ``src`` attribute, and
    whatever PIL raises if an image cannot be opened or saved.
    """
    widths = [360, 600, 1200, 1800]
    for ilustr in tree.findall('//ilustr'):
        rel_path = ilustr.attrib['src']
        # splitext only looks at the last path component, so a dot in a
        # directory name is never mistaken for the extension separator and
        # an extension-less file gets a distinct thumbnail name instead of
        # being overwritten by its own thumbnail.
        base, ext = os.path.splitext(rel_path)
        srcset = []
        # The context manager closes the underlying file once the
        # thumbnails have been written.
        with Image.open(gallery_path + rel_path) as img:
            full_width, full_height = img.size
            for w in widths:
                if w >= full_width:
                    continue
                # True division plus an explicit int keeps the size valid
                # for PIL on both Python 2 and 3; never collapse to 0 px.
                height = max(
                    1, int(round(full_height * w / float(full_width)))
                )
                fname = '%s.W%d%s' % (base, w, ext)
                img.resize((w, height)).save(gallery_path + fname)
                srcset.append('%s %dw' % (gallery_url + fname, w))
        srcset.append('%s %dw' % (gallery_url + rel_path, full_width))
        ilustr.attrib['srcset'] = ", ".join(srcset)
        ilustr.attrib['src'] = gallery_url + rel_path
77
78
def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None,
              gallery_path='img/', gallery_url='img/'):
    """Transform the WL document to HTML.

    Works on a deep copy of ``wldoc``; the caller's document is not
    modified.

    Parameters:
        wldoc: the source document.
        stylesheet: a key of STYLESHEETS selecting the XSLT to apply.
        options: optional dict passed as keyword parameters to the
            document's ``transform()`` call.
        flags: optional iterable of names; each is set to ``'yes'`` as an
            attribute on the document root before transforming.
        css: stylesheet URL passed to the XSLT as a quoted string
            parameter; defaults to the Wolne Lektury book text CSS.
        gallery_path, gallery_url: prefixes forwarded to add_image_sizes().

    Returns an OutputFile holding the UTF-8 HTML, or None if the result
    contains no ``p`` or ``h1`` element (see html_has_content()).

    Raises:
        ValueError: ``stylesheet`` is not a key of STYLESHEETS.
        ParseError: lxml reported an XML syntax or XSLT application error.
    """
    # Only the stylesheet lookup is guarded, so that a KeyError raised
    # anywhere else is not misreported as an invalid stylesheet name.
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % (stylesheet,))

    try:
        # Parse XSLT
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')

        if not options:
            options = {}

        add_image_sizes(document.edoc, gallery_path, gallery_url)

        css = (
            css
            or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
        )
        # XSLT parameters are XPath expressions, hence the extra quoting.
        css = "'%s'" % css
        result = document.transform(style, css=css, **options)
        del document  # no longer needed large object :)

        if not html_has_content(result):
            return None

        add_anchors(result.getroot())
        add_table_of_themes(result.getroot())
        add_table_of_contents(result.getroot())

        return OutputFile.from_bytes(etree.tostring(
            result, method='html', xml_declaration=False,
            pretty_print=True, encoding='utf-8'
        ))
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
130
131
@six.python_2_unicode_compatible
class Fragment(object):
    """A theme fragment: an id, its themes and a recorded event stream.

    ``events`` is a list of ``(event, payload)`` pairs. For 'start' and
    'end' events the payload is an element (its ``tag``, ``attrib``,
    ``text`` and ``tail`` are read); for any other event the payload is
    emitted verbatim by to_string().
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair at the end of the stream."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by 'end' events for unclosed tags.

        Every 'end' event is matched against the most recent unmatched
        'start'; an 'end' with nothing to match is reported on stdout.
        Missing closings are added innermost-first. ``self.events`` itself
        is left untouched.
        """
        pending = []
        for kind, node in self.events:
            if kind == 'start':
                pending.append(('end', node))
            elif kind == 'end':
                if pending:
                    pending.pop()
                else:
                    print('CLOSED NON-OPEN TAG:', node)

        return self.events + pending[::-1]

    def to_string(self):
        """Serialize the closed event stream to a markup string.

        NOTE: attribute values and text are emitted as they are, without
        any escaping, and an opening tag always contains a space after the
        tag name, even when the element has no attributes.
        """
        pieces = []
        for kind, node in self.closed_events():
            if kind == 'start':
                attributes = ' '.join(
                    '%s="%s"' % pair for pair in node.attrib.items()
                )
                pieces.append(u'<%s %s>' % (node.tag, attributes))
                if node.text:
                    pieces.append(node.text)
            elif kind == 'end':
                pieces.append(u'</%s>' % node.tag)
                if node.tail:
                    pieces.append(node.tail)
            else:
                # Anything else (e.g. 'text') carries a ready-made string.
                pieces.append(node)

        return ''.join(pieces)

    def __str__(self):
        return self.to_string()
181
182
def extract_fragments(input_filename):
    """Extract theme fragments from the HTML file ``input_filename``.

    The first child of the document body is re-serialized and streamed
    with ``iterparse``. A ``theme-begin`` element opens a Fragment keyed by
    its ``fid`` attribute; the ``theme-end`` element with the same ``fid``
    closes it. While a fragment is open, every other element event is
    recorded in it, except that elements with a ``name`` attribute or class
    'annotation'/'anchor' contribute only their tail text.

    Returns a ``(closed_fragments, open_fragments)`` pair of dicts mapping
    fragment id to Fragment; ``open_fragments`` holds the fragments that
    were never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document
    parser = etree.HTMLParser(encoding='utf-8')
    buf = six.BytesIO()
    buf.write(etree.tostring(
        etree.parse(input_filename, parser).getroot()[0][0],
        encoding='utf-8'
    ))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        element_class = element.get('class', '')

        # Process begin and end elements
        if element_class in ('theme-begin', 'theme-end'):
            if event != 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element_class == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Collect copies of the enclosing elements, innermost
                # first, up to (but excluding) the 'book-text' container.
                # Stop at the root as well, so a marker placed outside
                # 'book-text' does not crash on a missing parent.
                parents = []
                parent = element.getparent()
                while (parent is not None
                       and parent.get('id', None) != 'book-text'):
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    if 'id' in cparent.attrib:
                        del cparent.attrib['id']
                    parents.append(cparent)
                    parent = parent.getparent()

                # Open the ancestors outermost-first in the new fragment.
                for ancestor in reversed(parents):
                    fragment.append('start', ancestor)

                open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (
                        input_filename, element.get('fid')
                    ))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # The text following the marker belongs to every fragment that
            # is still open (we don't want to lose any text).
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags, keeping only the text that follows them
            if (len(element.get('name', '')) or
                    element_class in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append(
                            'text', element.tail
                        )
            else:
                for fragment_id in open_fragments:
                    # Each fragment gets its own id-less copy of the
                    # element.
                    celem = copy.copy(element)
                    if 'id' in celem.attrib:
                        del celem.attrib['id']
                    open_fragments[fragment_id].append(
                        event, celem
                    )

    return closed_fragments, open_fragments
262
263
def add_anchor(element, prefix, with_link=True, with_target=True,
               link_text=None):
    """Insert anchor ``a`` elements into ``element``'s parent, before it.

    With ``with_link``, an ``a class="anchor"`` linking to ``#prefix`` is
    inserted; its text is ``link_text``, or ``prefix`` when ``link_text``
    is None. With ``with_target``, an ``a class="target"`` named ``prefix``
    is inserted at the same index, so it ends up in front of the link.
    ``element`` must have a parent.
    """
    parent = element.getparent()
    position = parent.index(element)

    if with_link:
        label = prefix if link_text is None else link_text
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = six.text_type(label)
        parent.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = u' '
        parent.insert(position, target)
282
283
def any_ancestor(element, test):
    """Return True if ``test`` is truthy for any ancestor of ``element``.

    Ancestors are taken from ``element.iterancestors()`` in order, and the
    search stops at the first match.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
289
290
def add_anchors(root):
    """Number verses and paragraphs under ``root`` and add anchors to them.

    Elements nested inside notes, mottos, dedications, frames, the
    ``nota_red`` element or a blockquote are skipped and do not advance the
    counter. Every paragraph gets an anchor ``f<N>``; verses get one only
    for the first verse and for every counter value divisible by five.
    """
    skipped_classes = (
        'note', 'motto', 'motto_podpis', 'dedication', 'frame'
    )

    def is_skipped_container(e):
        return (
            e.get('class') in skipped_classes
            or e.get('id') == 'nota_red'
            or e.tag == 'blockquote'
        )

    number = 1
    for element in root.iterdescendants():
        if any_ancestor(element, is_skipped_container):
            continue

        is_verse = (
            element.tag == 'div' and 'verse' in element.get('class', '')
        )
        if is_verse:
            if number == 1 or number % 5 == 0:
                add_anchor(element, "f%d" % number, link_text=number)
            number += 1
        elif 'paragraph' in element.get('class', ''):
            add_anchor(element, "f%d" % number, link_text=number)
            number += 1
310             add_anchor(element, "f%d" % counter, link_text=counter)
311             counter += 1
312
313
def raw_printable_text(element):
    """Return the stripped text content of ``element``.

    Works on a deep copy, in which the text of direct ``a`` children with
    class 'annotation' or 'theme-begin' is blanked before serializing.
    """
    hidden_classes = ('annotation', 'theme-begin')
    clone = copy.deepcopy(element)
    for link in clone.findall('a'):
        if link.get('class') in hidden_classes:
            link.text = ''
    text = etree.tostring(clone, method='text', encoding='unicode')
    return text.strip()
320
321
def add_table_of_contents(root):
    """Build a table of contents from h2/h3 headings and prepend it to root.

    Headings inside the ``footnotes`` or ``nota_red`` elements, or inside a
    ``person-list``, are ignored. Every other heading gets a target anchor
    ``s<N>``. An h3 is listed under the preceding top-level entry when that
    entry is an h2; otherwise it becomes a top-level entry itself. The
    resulting ``div id="toc"`` is inserted as the first child of ``root``.
    """
    def in_skipped_container(e):
        return (
            e.get('id') in ('footnotes', 'nota_red')
            or e.get('class') in ('person-list',)
        )

    # Each entry is (number, tag, text, subentries).
    sections = []
    counter = 1
    for element in root.iterdescendants():
        if element.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(element, in_skipped_container):
            continue

        entry = (counter, element.tag, raw_printable_text(element), [])
        nested = (
            element.tag == 'h3'
            and len(sections) > 0
            and sections[-1][1] == 'h2'
        )
        if nested:
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(element, "s%d" % counter, with_link=False)
        counter += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    header = etree.SubElement(toc, 'h2')
    header.text = u'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for number, _tag, text, subsections in sections:
        item = etree.SubElement(toc_list, 'li')
        add_anchor(item, "s%d" % number, with_target=False,
                   link_text=text)

        if not subsections:
            continue
        sublist = etree.SubElement(item, 'ol')
        for subnumber, _subtag, subtext, _ in subsections:
            subitem = etree.SubElement(sublist, 'li')
            add_anchor(subitem, "s%d" % subnumber, with_target=False,
                       link_text=subtext)

    root.insert(0, toc)
364
365
def add_table_of_themes(root):
    """Build an index of themes and prepend it to ``root``.

    The comma-separated text of every ``a class="theme-begin"`` element is
    split into theme names. For each theme, sorted by name, the resulting
    ``div id="themes"`` lists numbered links to the ``name`` of every
    marker mentioning it. Sorting uses ``sortify`` when that module is
    importable and the plain theme name otherwise.
    """
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    anchors_by_theme = {}
    for marker in root.findall('.//a[@class="theme-begin"]'):
        if not marker.text:
            continue
        for raw_name in marker.text.split(','):
            anchors_by_theme.setdefault(raw_name.strip(), []).append(
                marker.get('name')
            )

    ordered_themes = sorted(
        anchors_by_theme.items(), key=lambda item: sortify(item[0])
    )

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, anchor_names in ordered_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for position, anchor_name in enumerate(anchor_names, 1):
            link = etree.SubElement(themes_li, 'a', href="#%s" % anchor_name)
            link.text = str(position)
            link.tail = ' '
    root.insert(0, themes_div)
392
393
def extract_annotations(html_path):
    """Extract annotations from an HTML file for the annotations dictionary.

    Yields, for every ``div`` child of the element with id ``footnotes``,
    a tuple of: anchor, footnote type, valid qualifiers, text, html.
    Nothing is yielded if the document has no such element.

    The footnote type is the second dash-separated part of the div's
    class, and the anchor is the ``href`` of its annotation link without
    the leading character. Qualifiers are read from a parenthesized list
    preceding an em dash at the start of the text and kept only if they
    appear in FN_QUALIFIERS.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        fn_type = footnote.get('class').split('-')[1]
        anchor = footnote.find('a[@class="annotation"]').get('href')[1:]

        # Drop the first two children and the leading text, then a lone
        # trailing newline, so that only the annotation body is serialized.
        del footnote[:2]
        footnote.text = None
        if len(footnote) and footnote[-1].tail == '\n':
            footnote[-1].tail = None

        text_str = etree.tostring(
            footnote, method='text', encoding='unicode'
        ).strip()
        html_str = etree.tostring(
            footnote, method='html', encoding='unicode'
        ).strip()

        qualifiers = []
        match = re_qualifier.match(text_str)
        if match:
            for candidate in re.split('[;,]', match.group(1)):
                candidate = candidate.strip()
                if candidate in FN_QUALIFIERS:
                    qualifiers.append(candidate)
                elif candidate.startswith('z '):
                    # For "z <qualifier>", try the word after the "z".
                    subcandidate = candidate.split()[1]
                    if subcandidate in FN_QUALIFIERS:
                        qualifiers.append(subcandidate)

        yield anchor, fn_type, qualifiers, text_str, html_str