New version with the new EPUB converter.
[librarian.git] / src / librarian / html.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import print_function, unicode_literals
7
8 import os
9 import re
10 import copy
11
12 from lxml import etree
13 from librarian import XHTMLNS, ParseError, OutputFile
14 from librarian import functions
15 from PIL import Image
16
17 from lxml.etree import XMLSyntaxError, XSLTApplyError
18 import six
19
20
# Import-time side effects: call the two registration hooks from
# librarian.functions.
# NOTE(review): presumably these register the extension functions that the
# XSLT stylesheets below rely on -- confirm in librarian/functions.py.
functions.reg_substitute_entities()
functions.reg_person_name()

# Stylesheet name -> XSLT file path, relative to this module's directory
# (resolved by get_stylesheet()).
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt',
}
29
30
def get_stylesheet(name):
    """Return the path of the XSLT stylesheet registered under *name*.

    The relative path stored in STYLESHEETS is joined onto the directory
    that contains this module.

    Raises KeyError if *name* is not a key of STYLESHEETS.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
33
34
def html_has_content(text):
    """Look for paragraphs or top-level headings in a parsed document.

    Evaluates an ETXPath expression matching ``p`` and ``h1`` elements,
    both without a namespace and in the XHTML namespace, against *text*
    and returns the evaluation result. Callers use the result for its
    truthiness.
    """
    xhtml_ns = str(XHTMLNS)
    expression = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': xhtml_ns}
    find_content = etree.ETXPath(expression)
    return find_content(text)
39
40
def transform_abstrakt(abstrakt_element):
    """Render an ``abstrakt`` element to an HTML string.

    The element is serialized, run through the 'legacy' stylesheet and the
    resulting markup is returned with section anchors and blockquote tags
    stripped out.
    """
    stylesheet = etree.parse(get_stylesheet('legacy'))
    source = etree.tostring(abstrakt_element, encoding='unicode')

    # HACK: plain textual rename, so the element is processed by the
    # stylesheet as if it were a 'dlugi_cytat'.
    renamed = source.replace('abstrakt', 'dlugi_cytat')
    document = etree.parse(six.StringIO(renamed))

    rendered = etree.tostring(document.xslt(stylesheet), encoding='unicode')
    without_anchors = re.sub('<a name="sec[0-9]*"/>', '', rendered)
    return re.sub('</?blockquote[^>]*>', '', without_anchors)
52
53
def add_image_sizes(tree, gallery_path, gallery_url, base_url):
    """Create scaled copies of every illustration and reference them.

    For each ``ilustr`` element found in *tree*, the image named by its
    ``src`` attribute (resolved against *base_url*) is opened and saved in
    a set of widths. The element then gets a ``srcset`` attribute listing
    all generated variants, and its ``src`` is pointed at the widest one.

    Generated widths are the predefined ones plus the original width,
    capped at whichever is smaller: the original width or the largest
    predefined width. Images are therefore never upscaled.

    :param tree: document tree searched for ``ilustr`` elements.
    :param gallery_path: filesystem prefix for generated files; it is
        concatenated directly with the file name, so it should end with a
        path separator.
    :param gallery_url: URL prefix for generated files, concatenated the
        same way.
    :param base_url: base URL used to resolve each ``src`` attribute.
    """
    widths = [360, 600, 1200, 1800, 2400]

    # './/ilustr' is the explicit spelling of the descendant search that
    # the absolute '//ilustr' path was being rewritten to.
    for i, ilustr in enumerate(tree.findall('.//ilustr')):
        rel_path = ilustr.attrib['src']
        img_url = six.moves.urllib.parse.urljoin(base_url, rel_path)

        f = six.moves.urllib.request.urlopen(img_url)
        try:
            img = Image.open(f)
            ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
            orig_width, orig_height = img.size

            # Needed widths: predefined and original, limited by
            # whichever is smaller.
            max_width = min(widths[-1], orig_width)
            img_widths = [
                w for w in sorted(set(widths + [orig_width]))
                if w <= max_width
            ]

            srcset = []
            largest_url = None
            for w in img_widths:
                fname = '%d.W%d.%s' % (i, w, ext)
                fpath = gallery_path + fname
                if not os.path.exists(fpath):
                    # Force true division and an integer result on both
                    # Python 2 and 3; never let the height collapse to 0.
                    height = max(
                        1, int(round(float(orig_height) * w / orig_width))
                    )
                    th = img.resize((w, height))
                    th.save(fpath)
                th_url = gallery_url + fname
                srcset.append('%s %dw' % (th_url, w))
                # Widths are ascending, so the last one is the largest.
                largest_url = th_url
        finally:
            # Release the connection even if decoding or saving failed.
            f.close()

        if largest_url is not None:
            ilustr.attrib['srcset'] = ", ".join(srcset)
            ilustr.attrib['src'] = largest_url
93
94
def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None,
              gallery_path='img/', gallery_url='img/', base_url='file://./'):
    """Transform the WL document to XHTML.

    The work is done on a deep copy of *wldoc*.

    :param wldoc: the source document.
    :param stylesheet: key of STYLESHEETS selecting the XSLT to apply.
    :param options: optional dict of extra keyword parameters passed on to
        the document's ``transform`` call.
    :param flags: optional iterable of names; each is set as an attribute
        with the value ``'yes'`` on the document root before transforming.
    :param css: stylesheet URL handed to the XSLT as a quoted string
        parameter; defaults to the wolnelektury.pl book text CSS.
    :param gallery_path: directory prefix for generated image variants
        (created if possible); passed to add_image_sizes().
    :param gallery_url: URL prefix for generated image variants.
    :param base_url: base URL for resolving illustration sources.
    :returns: an OutputFile built from the serialized HTML bytes, or None
        if the transformed document has no content.
    :raises ValueError: when a KeyError occurs during processing, which is
        what an unknown *stylesheet* name produces.
    :raises ParseError: on XMLSyntaxError or XSLTApplyError.
    """
    # Parse XSLT
    try:
        style_filename = get_stylesheet(stylesheet)
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')

        if not options:
            options = {}

        try:
            os.makedirs(gallery_path)
        except OSError:
            # Best effort: typically the directory already exists. Any
            # other problem surfaces when the images are written.
            pass

        add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)

        css = (
            css
            or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
        )
        # XSLT string parameters must be passed as quoted literals.
        css = "'%s'" % css
        result = document.transform(style, css=css, **options)
        del document  # no longer needed large object :)

        if html_has_content(result):
            add_anchors(result.getroot())
            add_table_of_themes(result.getroot())
            add_table_of_contents(result.getroot())

            return OutputFile.from_bytes(etree.tostring(
                result, method='html', xml_declaration=False,
                pretty_print=True, encoding='utf-8'
            ))
        else:
            return None
    except KeyError:
        # The original message left its '%s' placeholder unformatted.
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
151
152
@six.python_2_unicode_compatible
class Fragment(object):
    """A theme fragment recorded as a flat list of parse events.

    Each event is a ``(kind, payload)`` pair. For 'start' and 'end' events
    the payload is an element; for any other kind it is a piece of text.
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one more ``(event, element)`` pair."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by 'end' events for unclosed tags.

        An 'end' event that has no matching 'start' is reported on stdout
        and otherwise ignored.
        """
        unclosed = []
        for kind, payload in self.events:
            if kind == 'start':
                unclosed.append(('end', payload))
            elif kind == 'end':
                if unclosed:
                    unclosed.pop()
                else:
                    print('CLOSED NON-OPEN TAG:', payload)

        # Innermost tag is closed first.
        return self.events + unclosed[::-1]

    def to_string(self):
        """Serialize the balanced event list into a markup string."""
        chunks = []
        for kind, payload in self.closed_events():
            if kind == 'start':
                attributes = ' '.join(
                    '%s="%s"' % (key, value)
                    for key, value in payload.attrib.items()
                )
                chunks.append(u'<%s %s>' % (payload.tag, attributes))
                if payload.text:
                    chunks.append(payload.text)
            elif kind == 'end':
                chunks.append(u'</%s>' % payload.tag)
                if payload.tail:
                    chunks.append(payload.tail)
            else:
                # Plain text event: the payload is the text itself.
                chunks.append(payload)

        return ''.join(chunks)

    def __str__(self):
        return self.to_string()
202
203
def extract_fragments(input_filename):
    """Extract theme fragments from the HTML file *input_filename*.

    Returns a ``(closed_fragments, open_fragments)`` pair of dicts keyed by
    fragment id: fragments whose 'theme-end' marker was seen, and fragments
    that were still open when the document ended.
    """
    pending = {}
    finished = {}

    # iterparse would die on a HTML document, so parse it leniently first
    # and re-serialize the element at [0][0] of the root for iterparse.
    html_parser = etree.HTMLParser(encoding='utf-8')
    book_text = etree.parse(input_filename, html_parser).getroot()[0][0]
    stream = six.BytesIO(etree.tostring(book_text, encoding='utf-8'))

    def broadcast_text(text):
        # Hand a piece of text to every fragment that is currently open
        # (we don't want to lose any text).
        for open_fragment in pending.values():
            open_fragment.append('text', text)

    for event, element in etree.iterparse(stream, events=('start', 'end')):
        css_class = element.get('class', '')

        # Theme boundary markers.
        if css_class in ('theme-begin', 'theme-end'):
            if event != 'end':
                continue  # Process markers only once, on the end event

            if css_class == 'theme-begin':
                # Open a new fragment.
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Collect copies of the enclosing elements, up to (but not
                # including) the one with id 'book-text'.
                ancestors = []
                ancestor = element.getparent()
                while ancestor.get('id', None) != 'book-text':
                    stripped = copy.deepcopy(ancestor)
                    stripped.text = None
                    if 'id' in stripped.attrib:
                        del stripped.attrib['id']
                    ancestors.append(stripped)
                    ancestor = ancestor.getparent()

                # Outermost ancestor is opened first.
                for stripped in reversed(ancestors):
                    fragment.append('start', stripped)

                pending[fragment.id] = fragment
            else:
                # Close an existing fragment.
                fid = element.get('fid')
                if fid in pending:
                    fragment = pending.pop(fid)
                    finished[fragment.id] = fragment
                else:
                    print('%s:closed not open fragment #%s' % (
                        input_filename, fid
                    ))

            if element.tail:
                broadcast_text(element.tail)

        # Named anchors and annotations are left out; only the text
        # following them is kept.
        elif (element.get('name', '')
                or css_class in ('annotation', 'anchor')):
            if event == 'end' and element.tail:
                broadcast_text(element.tail)

        # Every other element is recorded in all open fragments.
        else:
            for open_fragment in pending.values():
                clone = copy.copy(element)
                if 'id' in clone.attrib:
                    del clone.attrib['id']
                open_fragment.append(event, clone)

    return finished, pending
283
284
def add_anchor(element, prefix, with_link=True, with_target=True,
               link_text=None):
    """Insert anchor elements into the parent of *element*, just before it.

    :param element: element the anchors are placed in front of.
    :param prefix: anchor name; used as the link's ``#`` href and as the
        target's ``name`` attribute.
    :param with_link: insert a visible ``<a class="anchor">`` link.
    :param with_target: insert an ``<a class="target">`` named target.
    :param link_text: text of the link; defaults to *prefix*.

    When both are requested, the target ends up before the link, because
    it is inserted second at the same index.
    """
    parent = element.getparent()
    position = parent.index(element)

    if with_link:
        label = prefix if link_text is None else link_text
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = six.text_type(label)
        parent.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = u' '
        parent.insert(position, target)
303
304
def any_ancestor(element, test):
    """Return True if *test* is truthy for any ancestor of *element*.

    Ancestors are taken from ``element.iterancestors()`` in iteration
    order, and checking stops at the first match.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
310
311
def add_anchors(root):
    """Number verses and paragraphs under *root* and add anchors to them.

    Elements with an ancestor that is a note, motto, dedication, frame,
    the 'nota_red' block or a blockquote are skipped and not counted.
    Every paragraph gets an anchor; verses are all counted, but only the
    first and every fifth one get an anchor.
    """
    skipped_classes = (
        'note', 'motto', 'motto_podpis', 'dedication', 'frame'
    )

    def is_skipped_container(e):
        if e.get('class') in skipped_classes:
            return True
        if e.get('id') == 'nota_red':
            return True
        return e.tag == 'blockquote'

    counter = 1
    for element in root.iterdescendants():
        if any_ancestor(element, is_skipped_container):
            continue

        if element.tag == 'div' and 'verse' in element.get('class', ''):
            show_number = counter == 1 or counter % 5 == 0
            if show_number:
                add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
        elif 'paragraph' in element.get('class', ''):
            add_anchor(element, "f%d" % counter, link_text=counter)
            counter += 1
332             counter += 1
333
334
def raw_printable_text(element):
    """Return the stripped text content of *element*.

    Works on a deep copy. The text of direct ``a`` children with class
    'annotation' or 'theme-begin' is blanked before serializing.
    """
    clone = copy.deepcopy(element)
    for link in clone.findall('a'):
        if link.get('class') in ('annotation', 'theme-begin'):
            link.text = ''
    flattened = etree.tostring(clone, method='text', encoding='unicode')
    return flattened.strip()
341
342
def add_table_of_contents(root):
    """Build a table of contents from h2/h3 headings and prepend it.

    Headings inside the footnotes, the 'nota_red' block or a person list
    are ignored. Each remaining heading gets a named target anchor
    ``s<N>``. An h3 that follows an h2 section becomes its subsection.
    The resulting ``<div id="toc">`` is inserted as the first child of
    *root*.
    """
    def in_excluded_region(e):
        return (
            e.get('id') in ('footnotes', 'nota_red')
            or e.get('class') in ('person-list',)
        )

    # Entries are (number, tag, text, subsections) tuples.
    sections = []
    counter = 1
    for element in root.iterdescendants():
        if element.tag not in ('h2', 'h3'):
            continue
        if any_ancestor(element, in_excluded_region):
            continue

        entry = (counter, element.tag, raw_printable_text(element), [])
        is_subsection = (
            element.tag == 'h3' and sections and sections[-1][1] == 'h2'
        )
        if is_subsection:
            sections[-1][3].append(entry)
        else:
            sections.append(entry)
        add_anchor(element, "s%d" % counter, with_link=False)
        counter += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    etree.SubElement(toc, 'h2').text = u'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    # Note: add_anchor() inserts the link in front of the given element,
    # so each link lands in the list right before its 'li' item.
    for number, _tag, title, children in sections:
        item = etree.SubElement(toc_list, 'li')
        add_anchor(item, "s%d" % number, with_target=False,
                   link_text=title)

        if children:
            sublist = etree.SubElement(item, 'ol')
            for child_number, _child_tag, child_title, _ in children:
                child_item = etree.SubElement(sublist, 'li')
                add_anchor(child_item, "s%d" % child_number,
                           with_target=False, link_text=child_title)

    root.insert(0, toc)
385
386
def add_table_of_themes(root):
    """Build an index of themes and prepend it to *root*.

    Theme names are read from the comma-separated text of every
    ``<a class="theme-begin">`` element. The index is a
    ``<div id="themes">`` holding one list item per theme, with numbered
    links to each marker's ``name``. Themes are ordered with ``sortify``
    when that module is importable, otherwise by their plain names.
    """
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    # theme name -> names of the markers where the theme begins
    occurrences = {}
    for marker in root.findall('.//a[@class="theme-begin"]'):
        if not marker.text:
            continue
        for raw_name in marker.text.split(','):
            anchors = occurrences.setdefault(raw_name.strip(), [])
            anchors.append(marker.get('name'))

    ordered = sorted(occurrences.items(), key=lambda pair: sortify(pair[0]))

    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, anchors in ordered:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for number, anchor in enumerate(anchors, 1):
            link = etree.SubElement(themes_li, 'a', href="#%s" % anchor)
            link.text = str(number)
            link.tail = ' '
    root.insert(0, themes_div)
413
414
def extract_annotations(html_path):
    """Extract annotations from HTML for the annotations dictionary.

    For each annotation, yields a tuple of:
    anchor, footnote type, valid qualifiers, text, html.

    Nothing is yielded when the document has no element with the id
    'footnotes'.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    # Matches a leading "term (qualifiers) <em dash>" and captures the
    # parenthesized part.
    re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
    if footnotes is None:
        return

    for footnote in footnotes.findall('div'):
        fn_type = footnote.get('class').split('-')[1]
        anchor = footnote.find('a[@class="annotation"]').get('href')[1:]

        # Drop the first two children and the leading text, leaving only
        # the remaining footnote content.
        del footnote[:2]
        footnote.text = None
        if len(footnote) and footnote[-1].tail == '\n':
            footnote[-1].tail = None

        text_str = etree.tostring(
            footnote, method='text', encoding='unicode'
        ).strip()
        html_str = etree.tostring(
            footnote, method='html', encoding='unicode'
        ).strip()

        qualifiers = []
        match = re_qualifier.match(text_str)
        if match:
            for candidate in re.split('[;,]', match.group(1)):
                candidate = candidate.strip()
                if candidate in FN_QUALIFIERS:
                    qualifiers.append(candidate)
                elif candidate.startswith('z '):
                    # "z <qualifier>": try the word after the 'z'.
                    subcandidate = candidate.split()[1]
                    if subcandidate in FN_QUALIFIERS:
                        qualifiers.append(subcandidate)

        yield anchor, fn_type, qualifiers, text_str, html_str
455
456             yield anchor, fn_type, qualifiers, text_str, html_str