9ec6583f7d5725a6b42bdb8ea7f3554ac21fdf7f
[librarian.git] / src / librarian / html.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import print_function, unicode_literals
7
8 import os
9 import re
10 import copy
11
12 from lxml import etree
13 from librarian import XHTMLNS, ParseError, OutputFile
14 from librarian import functions
15
16 from lxml.etree import XMLSyntaxError, XSLTApplyError
17 import six
18
19
# Import-time side effect: call librarian's ``reg_*`` helpers once, before
# any stylesheet in this module is applied.
# NOTE(review): presumably these register the custom extension functions
# used by the XSLT stylesheets -- confirm in ``librarian.functions``.
functions.reg_substitute_entities()
functions.reg_person_name()
22
# Stylesheet name -> XSLT file.  The paths are relative; ``get_stylesheet``
# joins them onto the directory that contains this module.
STYLESHEETS = dict(
    legacy='xslt/book2html.xslt',
    full='xslt/wl2html_full.xslt',
    partial='xslt/wl2html_partial.xslt',
)
28
29
def get_stylesheet(name):
    """Return the path of the XSLT stylesheet registered under ``name``.

    ``name`` must be a key of ``STYLESHEETS``; an unknown name raises
    ``KeyError``.  The returned path is the registered relative path joined
    onto the directory containing this module.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
32
33
def html_has_content(text):
    """Look for paragraphs and top-level headings in a transformed document.

    Evaluates an XPath that matches every ``p`` and ``h1`` element, both
    without a namespace and in the XHTML namespace, against ``text`` and
    returns the result of that evaluation.  ``transform`` passes the XSLT
    result here and only uses the return value for its truthiness.
    """
    namespace = str(XHTMLNS)
    expression = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': namespace}
    find_content = etree.ETXPath(expression)
    return find_content(text)
38
39
def transform_abstrakt(abstrakt_element):
    """Render an ``abstrakt`` element to an HTML snippet.

    The element is serialized, its ``abstrakt`` tags are renamed to
    ``dlugi_cytat`` and the result is run through the ``legacy`` stylesheet.
    Section anchors (``<a name="secN"/>``) and the opening/closing
    ``blockquote`` tags are then stripped from the output.

    Returns the resulting HTML as a text (unicode) string.
    """
    style_filename = get_stylesheet('legacy')
    style = etree.parse(style_filename)
    xml = etree.tostring(abstrakt_element, encoding='unicode')
    # HACK: rename the element to one the legacy stylesheet is fed instead
    # (presumably it has a template for ``dlugi_cytat`` -- TODO confirm).
    # Only tag names are rewritten: in serialized XML a literal ``<`` can
    # only start markup, so the word "abstrakt" occurring in the text itself
    # is no longer turned into "dlugi_cytat".
    xml = re.sub(r'(</?)abstrakt(?=[\s/>])', r'\1dlugi_cytat', xml)
    document = etree.parse(six.StringIO(xml))
    result = document.xslt(style)
    html = re.sub(
        r'<a name="sec[0-9]*"/>', '',
        etree.tostring(result, encoding='unicode')
    )
    return re.sub(r'</?blockquote[^>]*>', '', html)
51
52
def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None):
    """Transform a WL document to HTML.

    Args:
        wldoc: the document to transform.  It is deep-copied first, so the
            caller's object is left untouched.
        stylesheet: name of the stylesheet to use, a key of ``STYLESHEETS``.
        options: optional dict of extra keyword arguments forwarded to
            ``document.transform``.  ``gallery`` defaults to the quoted
            empty string ``''``.  The dict is copied, not modified.
        flags: optional iterable of attribute names; each is set to
            ``'yes'`` on the document root before the transformation.
        css: stylesheet URL forwarded (wrapped in single quotes) as the
            ``css`` argument; defaults to the Wolne Lektury book text CSS.

    Returns:
        An ``OutputFile`` built from the UTF-8 encoded HTML, or ``None``
        when the transformed document has no content (see
        ``html_has_content``).

    Raises:
        ValueError: if ``stylesheet`` is not a known stylesheet name.
        ParseError: if lxml raises ``XMLSyntaxError`` or ``XSLTApplyError``
            while parsing the stylesheet or applying it.
    """
    # Resolve the stylesheet on its own, so that only an unknown name -- and
    # not an unrelated KeyError raised further down -- is reported as such.
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)

    try:
        # Parse XSLT
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')

        # Work on a copy: filling in defaults must not leak into the
        # caller's dict.
        options = dict(options) if options else {}
        options.setdefault('gallery', "''")

        css = (
            css
            or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
        )
        css = "'%s'" % css
        result = document.transform(style, css=css, **options)
        del document  # no longer needed large object :)

        if not html_has_content(result):
            return None

        add_anchors(result.getroot())
        add_table_of_themes(result.getroot())
        add_table_of_contents(result.getroot())

        return OutputFile.from_bytes(etree.tostring(
            result, method='html', xml_declaration=False,
            pretty_print=True, encoding='utf-8'
        ))
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
103
104
@six.python_2_unicode_compatible
class Fragment(object):
    """A theme fragment stored as a flat list of ``(event, payload)`` pairs.

    For ``'start'`` and ``'end'`` events the payload is an element whose
    ``tag``, ``attrib``, ``text`` and ``tail`` are read when the fragment is
    serialized.  For any other event name the payload is emitted verbatim,
    so it is expected to be a string.
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair at the end of the fragment."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events plus ``'end'`` events for unclosed elements.

        The closing events are added innermost element first.  An ``'end'``
        event with nothing left to close is reported on stdout and
        otherwise kept as it is.
        """
        pending = []
        for kind, node in self.events:
            if kind == 'start':
                pending.append(('end', node))
            elif kind == 'end':
                if pending:
                    pending.pop()
                else:
                    print('CLOSED NON-OPEN TAG:', node)

        return self.events + pending[::-1]

    def to_string(self):
        """Serialize the balanced event stream to a markup string.

        Tags, attribute values and text are concatenated as they are; no
        escaping is applied.
        """
        pieces = []
        for kind, node in self.closed_events():
            if kind == 'start':
                attributes = ' '.join(
                    '%s="%s"' % (name, value)
                    for name, value in node.attrib.items()
                )
                pieces.append(u'<%s %s>' % (node.tag, attributes))
                following_text = node.text
            elif kind == 'end':
                pieces.append(u'</%s>' % node.tag)
                following_text = node.tail
            else:
                # Any other event carries plain text as its payload.
                pieces.append(node)
                continue

            if following_text:
                pieces.append(following_text)

        return ''.join(pieces)

    def __str__(self):
        return self.to_string()
154
155
def extract_fragments(input_filename):
    """Extract theme fragments from the HTML file ``input_filename``.

    The document is scanned for elements with class ``theme-begin`` and
    ``theme-end``; their ``fid`` attribute identifies the fragment they open
    or close.  Everything seen while a fragment is open is recorded in it as
    ``Fragment`` events.

    Returns a ``(closed_fragments, open_fragments)`` tuple of dicts mapping
    fragment id to ``Fragment``: the first holds fragments whose
    ``theme-end`` marker was found, the second those still open when the
    document ended.  Closing a fragment that is not open is reported on
    stdout and otherwise ignored.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document, so the file is first read with
    # the HTML parser and one subtree is re-serialized for iterparse.
    # NOTE(review): ``getroot()[0][0]`` is the first child of the root's
    # first child -- presumably the main element inside <body>; confirm
    # against the generated HTML.
    parser = etree.HTMLParser(encoding='utf-8')
    buf = six.BytesIO()
    buf.write(etree.tostring(
        etree.parse(input_filename, parser).getroot()[0][0],
        encoding='utf-8'
    ))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append parents: the fragment starts with 'start' events for
                # every ancestor up to (not including) the element with
                # id="book-text", outermost first, so that its markup nests
                # like the original.  Each ancestor is copied with its text
                # and id removed.
                # NOTE(review): if no ancestor has id="book-text", ``parent``
                # becomes None at the root and ``parent.get`` raises
                # AttributeError.
                parent = element.getparent()
                parents = []
                while parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    if 'id' in cparent.attrib:
                        del cparent.attrib['id']
                    parents.append(cparent)
                    parent = parent.getparent()

                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (
                        input_filename, element.get('fid')
                    ))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the marker's tail to every fragment that is open now
            # (we don't want to lose any text).  This runs after the marker
            # was handled, so a fragment just opened receives the tail and a
            # fragment just closed does not.
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags: elements with a non-empty ``name``
            # attribute or class ``annotation``/``anchor`` are dropped, only
            # the text following them is kept.
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append(
                            'text', element.tail
                        )
            else:
                # Every open fragment gets its own copy of the element
                # (without the id attribute) for both its 'start' and its
                # 'end' event.
                for fragment_id in open_fragments:
                    celem = copy.copy(element)
                    if 'id' in celem.attrib:
                        del celem.attrib['id']
                    open_fragments[fragment_id].append(
                        event, celem
                    )

    return closed_fragments, open_fragments
235
236
def add_anchor(element, prefix, with_link=True, with_target=True,
               link_text=None):
    """Insert anchor ``a`` elements into ``element``'s parent, before it.

    With ``with_link`` an ``<a class="anchor" href="#prefix">`` is inserted,
    labelled with ``link_text`` (or with ``prefix`` when that is ``None``).
    With ``with_target`` an ``<a class="target" name="prefix">`` containing a
    single space is inserted.  Both go in at the position ``element`` had, so
    when both are requested the target ends up before the link.
    """
    container = element.getparent()
    position = container.index(element)

    if with_link:
        label = prefix if link_text is None else link_text
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = six.text_type(label)
        container.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = u' '
        container.insert(position, target)
255
256
def any_ancestor(element, test):
    """Return True if ``test`` is truthy for any ancestor of ``element``.

    Ancestors are taken from ``element.iterancestors()`` and checking stops
    at the first one that passes.  Returns False when none does.
    """
    return any(test(node) for node in element.iterancestors())
262
263
def add_anchors(root):
    """Number the verses and paragraphs below ``root`` and anchor them.

    Counted are ``div`` elements whose class contains ``verse`` and any
    other element whose class contains ``paragraph``, in document order,
    starting at 1.  Elements nested inside a note, motto, motto signature,
    dedication or frame, inside the element with id ``nota_red`` or inside
    a ``blockquote`` are skipped and not counted.

    Every counted paragraph gets an anchor ``f<N>``; verses get one only
    for N == 1 and for every multiple of five.
    """
    excluded_classes = (
        'note', 'motto', 'motto_podpis', 'dedication', 'frame'
    )

    # Defined once instead of being rebuilt for every visited element.
    def is_excluded_container(e):
        return (
            e.get('class') in excluded_classes
            or e.get('id') == 'nota_red'
            or e.tag == 'blockquote'
        )

    counter = 1
    for element in root.iterdescendants():
        # ``get`` can return None even when a default is passed (lxml
        # comment and processing-instruction nodes carry no attributes),
        # which would make the ``in`` tests below raise TypeError.
        css_class = element.get('class') or ''
        is_verse = element.tag == 'div' and 'verse' in css_class
        if not is_verse and 'paragraph' not in css_class:
            continue

        # Walk the ancestors only for elements that could get an anchor.
        if any_ancestor(element, is_excluded_container):
            continue

        if is_verse:
            if counter == 1 or counter % 5 == 0:
                add_anchor(element, "f%d" % counter, link_text=counter)
        else:
            add_anchor(element, "f%d" % counter, link_text=counter)
        counter += 1
285
286
def raw_printable_text(element):
    """Return the text content of ``element`` with surrounding space removed.

    The text of direct ``a`` children with class ``annotation`` or
    ``theme-begin`` is left out (the text following them is kept).  The work
    is done on a deep copy, so ``element`` itself is not modified.
    """
    hidden_classes = ('annotation', 'theme-begin')
    clone = copy.deepcopy(element)
    for link in clone.findall('a'):
        if link.get('class') in hidden_classes:
            link.text = ''
    text = etree.tostring(clone, method='text', encoding='unicode')
    return text.strip()
293
294
def add_table_of_contents(root):
    """Build a table of contents from h2/h3 headings and prepend it to root.

    Headings inside the elements with id ``footnotes`` or ``nota_red``, or
    inside an element with class ``person-list``, are ignored.  Every other
    heading gets a target anchor ``s<N>`` and an entry; an ``h3`` that
    follows an ``h2`` entry is listed as a subsection of it.

    The result, ``<div id="toc">`` with a heading and an ordered list, is
    inserted as the first child of ``root``.
    """
    def in_skipped_region(e):
        return (
            e.get('id') in ('footnotes', 'nota_red')
            or e.get('class') in ('person-list',)
        )

    sections = []
    counter = 1
    for heading in root.iterdescendants():
        if heading.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(heading, in_skipped_region):
            continue

        # Entry layout: (number, tag, title, subsections).
        entry = (counter, heading.tag, raw_printable_text(heading), [])
        if heading.tag == 'h3' and sections and sections[-1][1] == 'h2':
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(heading, "s%d" % counter, with_link=False)
        counter += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    etree.SubElement(toc, 'h2').text = u'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    # NOTE(review): add_anchor inserts into the parent of the element it is
    # given, so each link lands in the list right before its ``li``, not
    # inside it -- confirm this is the intended markup.
    for number, _tag, title, children in sections:
        item = etree.SubElement(toc_list, 'li')
        add_anchor(item, "s%d" % number, with_target=False,
                   link_text=title)

        if not children:
            continue

        sublist = etree.SubElement(item, 'ol')
        for sub_number, _sub_tag, sub_title, _ in children:
            sub_item = etree.SubElement(sublist, 'li')
            add_anchor(sub_item, "s%d" % sub_number, with_target=False,
                       link_text=sub_title)

    root.insert(0, toc)
337
338
def add_table_of_themes(root):
    """Prepend an index of themes, ``<div id="themes">``, to ``root``.

    Every ``a`` element with class ``theme-begin`` and non-empty text
    contributes its ``name`` attribute to each theme listed in its
    comma-separated text.  The index is an ordered list with one item per
    theme, sorted by name (through ``sortify`` when that module can be
    imported), holding numbered links ``#<name>`` to the theme's fragments.
    """
    try:
        from sortify import sortify
    except ImportError:
        # Without sortify the theme names are compared as they are.
        def sortify(x):
            return x

    names_by_theme = {}
    for marker in root.findall('.//a[@class="theme-begin"]'):
        if not marker.text:
            continue
        for raw_name in marker.text.split(','):
            targets = names_by_theme.setdefault(raw_name.strip(), [])
            targets.append(marker.get('name'))

    ordered_themes = sorted(
        names_by_theme.items(), key=lambda pair: sortify(pair[0])
    )

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, targets in ordered_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for number, target in enumerate(targets, 1):
            link = etree.SubElement(themes_li, 'a', href="#%s" % target)
            link.text = str(number)
            link.tail = ' '
    root.insert(0, themes_div)
365
366
def extract_annotations(html_path):
    """Extract annotations from an HTML file for the annotations dictionary.

    Looks at the ``div`` children of the element with id ``footnotes`` and
    yields, for each of them, a tuple of:
    anchor, footnote type, valid qualifiers, text, html.

    The anchor is the ``href`` of the footnote's ``a.annotation`` link
    without its first character; the type is the second dash-separated part
    of the footnote's class.  Qualifiers are read from a parenthesised list
    that precedes an em dash at the start of the text; only entries found
    in ``FN_QUALIFIERS`` are kept.  Nothing is yielded when the document has
    no ``footnotes`` element.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        fn_type = footnote.get('class').split('-')[1]
        anchor = footnote.find('a[@class="annotation"]').get('href')[1:]

        # Drop the first two children and the leading text, leaving only
        # the body of the footnote.
        del footnote[:2]
        footnote.text = None
        if len(footnote) and footnote[-1].tail == '\n':
            footnote[-1].tail = None

        text_str = etree.tostring(footnote, method='text',
                                  encoding='unicode').strip()
        html_str = etree.tostring(footnote, method='html',
                                  encoding='unicode').strip()

        qualifiers = []
        match = re_qualifier.match(text_str)
        if match:
            for candidate in re.split('[;,]', match.group(1)):
                candidate = candidate.strip()
                if candidate in FN_QUALIFIERS:
                    qualifiers.append(candidate)
                elif candidate.startswith('z '):
                    # "z <qualifier>": check the word after the "z".
                    subcandidate = candidate.split()[1]
                    if subcandidate in FN_QUALIFIERS:
                        qualifiers.append(subcandidate)

        yield anchor, fn_type, qualifiers, text_str, html_str