78f3daddcb6537fe2d585e2b19aca3e86feb72b7
[librarian.git] / src / librarian / html.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import print_function, unicode_literals
7
8 import os
9 import re
10 import copy
11
12 from lxml import etree
13 from librarian import XHTMLNS, ParseError, OutputFile
14 from librarian import functions
15
16 from lxml.etree import XMLSyntaxError, XSLTApplyError
17 import six
18
19
# Import-time side effects: both calls run once when this module is loaded.
# NOTE(review): presumably they register custom XPath/XSLT extension
# functions used by the stylesheets below -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()
22
# Maps a stylesheet name accepted by get_stylesheet()/transform() to the
# XSLT file path, relative to the directory containing this module.
STYLESHEETS = {
    # Default for transform(); also used by transform_abstrakt().
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt',
}
28
29
def get_stylesheet(name):
    """Return the filesystem path of the stylesheet registered as ``name``.

    The path is built from this module's directory and the relative path
    stored in ``STYLESHEETS``.

    Raises ``KeyError`` when ``name`` is not a key of ``STYLESHEETS``.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
32
33
def html_has_content(text):
    """Look for paragraph or top-level heading elements in a parsed tree.

    Evaluates an XPath matching ``p`` and ``h1`` elements, both without a
    namespace and in the ``XHTMLNS`` namespace, against ``text`` and returns
    the evaluation result; callers use its truthiness.
    """
    query = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
    find_content = etree.ETXPath(query)
    return find_content(text)
38
39
def transform_abstrakt(abstrakt_element):
    """Render an ``abstrakt`` element to an HTML snippet.

    The element is serialized, its ``abstrakt`` tags are renamed to
    ``dlugi_cytat``, and the result is run through the 'legacy' stylesheet.
    Section anchors (``<a name="secN"/>``) and ``blockquote`` tags are then
    stripped from the output.

    :param abstrakt_element: lxml element to render.
    :returns: the resulting HTML as a text string.
    """
    style_filename = get_stylesheet('legacy')
    style = etree.parse(style_filename)
    xml = etree.tostring(abstrakt_element, encoding='unicode')
    # HACK: the element is disguised as ``dlugi_cytat`` before the transform
    # (presumably because the legacy stylesheet has a template for that tag
    # but none for ``abstrakt`` -- confirm against the XSLT).
    # Only opening/closing tag names are renamed; a plain str.replace() would
    # also rewrite the word "abstrakt" inside text and attribute values.
    xml = re.sub(r'(</?)abstrakt(?=[\s/>])', r'\1dlugi_cytat', xml)
    document = etree.parse(six.StringIO(xml))
    result = document.xslt(style)
    html = re.sub('<a name="sec[0-9]*"/>', '',
                  etree.tostring(result, encoding='unicode'))
    return re.sub('</?blockquote[^>]*>', '', html)
51
52
def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None):
    """Transform the WL document to XHTML.

    :param wldoc: source document; it is deep-copied first, so the object
        passed in is not modified.
    :param stylesheet: key of ``STYLESHEETS`` selecting the XSLT to apply.
    :param options: optional dict of extra keyword arguments forwarded to
        the document's ``transform()``; ``gallery`` defaults to ``"''"``.
        The caller's dict is not modified.
    :param flags: optional iterable of attribute names that are set to
        ``'yes'`` on the document root before transforming.
    :param css: stylesheet URL; the Wolne Lektury book-text CSS is used
        when not given.
    :returns: an ``OutputFile`` built from the UTF-8 HTML bytes, or ``None``
        when the transformed document has no content (no ``p``/``h1``).
    :raises ValueError: if ``stylesheet`` is not a known stylesheet name.
    :raises ParseError: on XML syntax or XSLT application errors.
    """
    # Only the name lookup may be reported as "invalid stylesheet"; a
    # KeyError raised deeper in the pipeline must not be misattributed.
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)

    # Copy so that adding defaults never mutates the caller's dict.
    options = dict(options) if options else {}
    options.setdefault('gallery', "''")

    css = (
        css
        or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
    )
    # Wrapped in single quotes before being handed to the transform
    # (presumably so it is read as an XPath string literal -- TODO confirm).
    css = "'%s'" % css

    try:
        # Parse XSLT
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')

        result = document.transform(style, css=css, **options)
        del document  # no longer needed large object :)

        if not html_has_content(result):
            return None

        add_anchors(result.getroot())
        add_table_of_themes(result.getroot())
        add_table_of_contents(result.getroot())

        return OutputFile.from_bytes(etree.tostring(
            result, method='html', xml_declaration=False,
            pretty_print=True, encoding='utf-8'
        ))
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
103
104
@six.python_2_unicode_compatible
class Fragment(object):
    """A theme fragment: an id, its themes and a recorded event stream.

    Events are ``(kind, payload)`` tuples where ``kind`` is ``'start'`` or
    ``'end'`` (payload is an element) or anything else (payload is emitted
    verbatim as text).
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair at the end of the stream."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by 'end' events for unclosed starts.

        An 'end' event with no matching open 'start' is reported on stdout
        and otherwise left in place.
        """
        pending = []
        for kind, node in self.events:
            if kind == 'start':
                pending.append(('end', node))
            elif kind == 'end':
                if pending:
                    pending.pop()
                else:
                    print('CLOSED NON-OPEN TAG:', node)

        # Innermost open element must be closed first.
        return self.events + pending[::-1]

    def to_string(self):
        """Serialize the balanced event stream to a markup string.

        Tag names, attribute values and text are written as-is, without
        escaping.
        """
        chunks = []
        for kind, node in self.closed_events():
            if kind == 'start':
                attributes = ' '.join(
                    '%s="%s"' % (name, value)
                    for name, value in node.attrib.items()
                )
                chunks.append(u'<%s %s>' % (node.tag, attributes))
                if node.text:
                    chunks.append(node.text)
            elif kind == 'end':
                chunks.append(u'</%s>' % node.tag)
                if node.tail:
                    chunks.append(node.tail)
            else:
                # Any other event carries plain text as its payload.
                chunks.append(node)

        return ''.join(chunks)

    def __str__(self):
        return self.to_string()
154
155
def extract_fragments(input_filename):
    """Extract theme fragments from the HTML file ``input_filename``.

    Elements with class ``theme-begin`` open a fragment keyed by their
    ``fid`` attribute; elements with class ``theme-end`` close the fragment
    with the same ``fid``. While a fragment is open, the start/end events
    and text seen by the parser are recorded in it.

    Returns a tuple ``(closed_fragments, open_fragments)`` of dicts mapping
    fragment id to ``Fragment``; the second holds fragments that were opened
    but never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document, so the file is first parsed
    # with the lenient HTML parser and one subtree is re-serialized into an
    # in-memory buffer for iterparse.
    # NOTE(review): getroot()[0][0] is presumably the book-text container
    # (first child of <body>) -- confirm against the generated HTML.
    parser = etree.HTMLParser(encoding='utf-8')
    buf = six.BytesIO()
    buf.write(etree.tostring(
        etree.parse(input_filename, parser).getroot()[0][0],
        encoding='utf-8'
    ))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append parents: copies of every ancestor below the element
                # with id "book-text", with their leading text dropped, so
                # the fragment starts inside the same nesting of tags.
                # NOTE(review): if no ancestor has id "book-text", the walk
                # reaches the root, getparent() returns None and .get fails.
                parent = element.getparent()
                parents = []
                while parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    parents.append(cparent)
                    parent = parent.getparent()

                # Collected innermost-first; emit outermost-first.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (
                        input_filename, element.get('fid')
                    ))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the marker's tail text to every fragment that is open
            # at this point (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit elements that have a non-empty "name" attribute or the
            # class "annotation"/"anchor"; only their tail text is kept.
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append(
                            'text', element.tail
                        )
            else:
                # Record the event, with a shallow copy of the element, in
                # every open fragment.
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append(
                        event, copy.copy(element)
                    )

    return closed_fragments, open_fragments
230
231
def add_anchor(element, prefix, with_link=True, with_target=True,
               link_text=None):
    """Insert anchor ``<a>`` elements directly before ``element``.

    With ``with_link``, an ``<a class="anchor" href="#prefix">`` is inserted
    whose text is ``link_text`` (``prefix`` when ``link_text`` is None).
    With ``with_target``, an ``<a class="target" name="prefix">`` holding a
    single space is inserted. When both are requested, the target ends up
    before the link. ``element`` must have a parent.
    """
    container = element.getparent()
    position = container.index(element)

    if with_link:
        label = prefix if link_text is None else link_text
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = six.text_type(label)
        container.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = u' '
        # Same index as the link, so the target lands in front of it.
        container.insert(position, target)
250
251
def any_ancestor(element, test):
    """Return True if ``test`` is truthy for at least one ancestor.

    Ancestors are taken from ``element.iterancestors()`` and checked in the
    order they are yielded; checking stops at the first match. Returns
    False when there are no ancestors or none matches.
    """
    return any(test(candidate) for candidate in element.iterancestors())
257
258
def add_anchors(root):
    """Number verses and paragraphs below ``root`` and add anchors to them.

    Descendants are visited in document order with one shared counter
    starting at 1. A ``div`` whose class contains ``verse`` always advances
    the counter but only gets an anchor for number 1 and every multiple of
    5. Any other element whose class contains ``paragraph`` gets an anchor
    and advances the counter. Anchors are named ``f<number>``.

    Elements nested inside notes, mottos, dedications, frames, the
    ``nota_red`` block or a ``blockquote`` are skipped entirely.
    """
    excluded_classes = (
        'note', 'motto', 'motto_podpis', 'dedication', 'frame'
    )

    # Defined once, outside the loop: it does not depend on the element
    # currently being visited.
    def is_excluded_container(e):
        return (
            e.get('class') in excluded_classes
            or e.get('id') == 'nota_red'
            or e.tag == 'blockquote'
        )

    counter = 1
    for element in root.iterdescendants():
        if any_ancestor(element, is_excluded_container):
            continue

        # Comment and processing-instruction nodes are also yielded here and
        # their get() returns None even when a default is supplied, so fall
        # back to '' explicitly to keep the substring tests below valid.
        css_class = element.get('class') or ''

        if element.tag == 'div' and 'verse' in css_class:
            if counter == 1 or counter % 5 == 0:
                add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
        elif 'paragraph' in css_class:
            add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
280
281
def raw_printable_text(element):
    """Return the stripped text content of ``element``.

    Works on a deep copy, so ``element`` is left untouched. The text of
    direct ``a`` children with class ``annotation`` or ``theme-begin`` is
    blanked before the copy is serialized as text.
    """
    hidden_classes = ('annotation', 'theme-begin')
    clone = copy.deepcopy(element)
    for link in clone.findall('a'):
        if link.get('class') in hidden_classes:
            link.text = ''
    plain = etree.tostring(clone, method='text', encoding='unicode')
    return plain.strip()
288
289
def add_table_of_contents(root):
    """Build a table of contents from ``h2``/``h3`` headings under ``root``.

    Headings inside the ``footnotes`` or ``nota_red`` blocks, or inside an
    element with class ``person-list``, are ignored. Every remaining heading
    gets a target anchor ``s<number>``. An ``h3`` that follows an ``h2``
    top-level entry becomes its subsection; anything else becomes a
    top-level entry. The resulting ``<div id="toc">`` is inserted as the
    first child of ``root``.
    """
    def is_outside_main_text(e):
        return (
            e.get('id') in ('footnotes', 'nota_red')
            or e.get('class') in ('person-list',)
        )

    # Entries are (number, tag, text, subsections) tuples.
    sections = []
    counter = 1
    for element in root.iterdescendants():
        if element.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(element, is_outside_main_text):
            continue

        entry = (counter, element.tag, raw_printable_text(element), [])
        nests_under_previous = (
            element.tag == 'h3' and len(sections)
            and sections[-1][1] == 'h2'
        )
        if nests_under_previous:
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(element, "s%d" % counter, with_link=False)
        counter += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    heading = etree.SubElement(toc, 'h2')
    heading.text = u'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for number, _tag, title, children in sections:
        # add_anchor() places the link before the <li>, inside the <ol>.
        item = etree.SubElement(toc_list, 'li')
        add_anchor(item, "s%d" % number, with_target=False,
                   link_text=title)

        if children:
            sublist = etree.SubElement(item, 'ol')
            for child_number, _child_tag, child_title, _ in children:
                child_item = etree.SubElement(sublist, 'li')
                add_anchor(child_item, "s%d" % child_number,
                           with_target=False, link_text=child_title)

    root.insert(0, toc)
332
333
def add_table_of_themes(root):
    """Insert an index of themes as the first child of ``root``.

    Every ``<a class="theme-begin">`` with text contributes its ``name``
    attribute to each comma-separated theme listed in that text. The index
    is a ``<div id="themes">`` holding an ordered list with one item per
    theme, each followed by numbered links to the theme's fragments.
    Themes are ordered with ``sortify`` when that package is importable,
    and by their plain names otherwise.
    """
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    anchors_by_theme = {}
    for marker in root.findall('.//a[@class="theme-begin"]'):
        if not marker.text:
            continue
        for raw_name in marker.text.split(','):
            anchors = anchors_by_theme.setdefault(raw_name.strip(), [])
            anchors.append(marker.get('name'))

    ordered_themes = sorted(
        anchors_by_theme.items(), key=lambda pair: sortify(pair[0])
    )

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, anchors in ordered_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for ordinal, anchor in enumerate(anchors, 1):
            link = etree.SubElement(themes_li, 'a', href="#%s" % anchor)
            link.text = str(ordinal)
            link.tail = ' '
    root.insert(0, themes_div)
360
361
def extract_annotations(html_path):
    """Extract annotations from HTML for the annotations dictionary.

    Reads the element with id ``footnotes`` from the file at ``html_path``
    and, for each of its direct ``div`` children, yields a tuple of:
    anchor, footnote type, valid qualifiers, text, html.

    The anchor is the ``href`` of the footnote's annotation link without
    its first character; the type is the second dash-separated part of the
    footnote's class. Qualifiers are taken from a parenthesised list that
    precedes an em dash at the start of the text and are kept only when
    present in ``FN_QUALIFIERS``. Nothing is yielded when the document has
    no footnotes element.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        fn_type = footnote.get('class').split('-')[1]
        anchor = footnote.find('a[@class="annotation"]').get('href')[1:]

        # Drop the first two child elements and the leading text before
        # serializing (presumably the anchor links -- TODO confirm).
        del footnote[:2]
        footnote.text = None
        if len(footnote) and footnote[-1].tail == '\n':
            footnote[-1].tail = None

        text_str = etree.tostring(
            footnote, method='text', encoding='unicode'
        ).strip()
        html_str = etree.tostring(
            footnote, method='html', encoding='unicode'
        ).strip()

        qualifiers = []
        match = re_qualifier.match(text_str)
        if match:
            for candidate in re.split('[;,]', match.group(1)):
                candidate = candidate.strip()
                if candidate in FN_QUALIFIERS:
                    qualifiers.append(candidate)
                elif candidate.startswith('z '):
                    # "z <qualifier>": check the word following "z".
                    subcandidate = candidate.split()[1]
                    if subcandidate in FN_QUALIFIERS:
                        qualifiers.append(subcandidate)

        yield anchor, fn_type, qualifiers, text_str, html_str