librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import print_function, unicode_literals
   7
   8 import os
   9 import re
  10 import copy
  11
  12 from lxml import etree
  13 from librarian import XHTMLNS, ParseError, OutputFile
  14 from librarian import functions
  15
  16 from lxml.etree import XMLSyntaxError, XSLTApplyError
  17 import six
  18
  19
# Module-level side effect: run the registration hooks from librarian.functions
# at import time. NOTE(review): presumably these register XSLT extension
# functions used by the stylesheets below -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Maps a stylesheet name (as accepted by get_stylesheet) to the path of its
# XSLT file, relative to the directory containing this module.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
  28
  29
  30 def get_stylesheet(name):
  31     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  32
  33
  34 def html_has_content(text):
  35     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  36
  37
  38 def transform_abstrakt(abstrakt_element):
  39     style_filename = get_stylesheet('legacy')
  40     style = etree.parse(style_filename)
  41     xml = etree.tostring(abstrakt_element, encoding='unicode')
  42     document = etree.parse(six.StringIO(xml.replace('abstrakt', 'dlugi_cytat')))  # HACK
  43     result = document.xslt(style)
  44     html = re.sub('<a name="sec[0-9]*"/>', '', etree.tostring(result, encoding='unicode'))
  45     return re.sub('</?blockquote[^>]*>', '', html)
  46
  47
  48 def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None):
  49     """Transforms the WL document to XHTML.
  50
  51     If output_filename is None, returns an XML,
  52     otherwise returns True if file has been written,False if it hasn't.
  53     File won't be written if it has no content.
  54     """
  55     # Parse XSLT
  56     try:
  57         style_filename = get_stylesheet(stylesheet)
  58         style = etree.parse(style_filename)
  59
  60         document = copy.deepcopy(wldoc)
  61         del wldoc
  62         document.swap_endlines()
  63
  64         if flags:
  65             for flag in flags:
  66                 document.edoc.getroot().set(flag, 'yes')
  67
  68         document.clean_ed_note()
  69         document.clean_ed_note('abstrakt')
  70
  71         if not options:
  72             options = {}
  73         options.setdefault('gallery', "''")
  74
  75         css = css or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
  76         css = "'%s'" % css
  77         result = document.transform(style, css=css, **options)
  78         del document  # no longer needed large object :)
  79
  80         if html_has_content(result):
  81             add_anchors(result.getroot())
  82             add_table_of_themes(result.getroot())
  83             add_table_of_contents(result.getroot())
  84
  85             return OutputFile.from_bytes(etree.tostring(
  86                 result, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8'))
  87         else:
  88             return None
  89     except KeyError:
  90         raise ValueError("'%s' is not a valid stylesheet.")
  91     except (XMLSyntaxError, XSLTApplyError) as e:
  92         raise ParseError(e)
  93
  94
  95 @six.python_2_unicode_compatible
  96 class Fragment(object):
  97     def __init__(self, id, themes):
  98         super(Fragment, self).__init__()
  99         self.id = id
 100         self.themes = themes
 101         self.events = []
 102
 103     def append(self, event, element):
 104         self.events.append((event, element))
 105
 106     def closed_events(self):
 107         stack = []
 108         for event, element in self.events:
 109             if event == 'start':
 110                 stack.append(('end', element))
 111             elif event == 'end':
 112                 try:
 113                     stack.pop()
 114                 except IndexError:
 115                     print('CLOSED NON-OPEN TAG:', element)
 116
 117         stack.reverse()
 118         return self.events + stack
 119
 120     def to_string(self):
 121         result = []
 122         for event, element in self.closed_events():
 123             if event == 'start':
 124                 result.append(u'<%s %s>' % (
 125                     element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 126                 if element.text:
 127                     result.append(element.text)
 128             elif event == 'end':
 129                 result.append(u'</%s>' % element.tag)
 130                 if element.tail:
 131                     result.append(element.tail)
 132             else:
 133                 result.append(element)
 134
 135         return ''.join(result)
 136
 137     def __str__(self):
 138         return self.to_string()
 139
 140
def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename.

    The HTML file is parsed, one subtree of it is re-serialized and then
    streamed with iterparse. Elements with class "theme-begin" open a
    Fragment (keyed by their "fid" attribute) and elements with class
    "theme-end" close it; everything seen in between is recorded as events
    on every fragment that is open at that moment.

    Returns a tuple (closed_fragments, open_fragments) of dicts mapping
    fragment id to Fragment; open_fragments holds the fragments that were
    never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document
    # so the document is parsed with the HTML parser first and the subtree at
    # root[0][0] is serialized into an in-memory buffer for iterparse.
    # NOTE(review): root[0][0] is presumably the book text container (first
    # child of <body>) -- confirm against the generated HTML.
    parser = etree.HTMLParser(encoding='utf-8')
    buf = six.BytesIO()
    buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append parents
                # Walk up until the element with id "book-text", collecting
                # copies of the ancestors with their leading text removed.
                # NOTE(review): if no ancestor has that id, parent becomes
                # None and parent.get fails -- confirm this cannot happen.
                parent = element.getparent()
                parents = []
                while parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    parents.append(cparent)
                    parent = parent.getparent()

                # Outermost ancestor first, so the tags open in document order.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (input_filename, element.get('fid')))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the marker's tail to every fragment that is open at this
            # point (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags
            # (elements with a non-empty "name" attribute or with class
            # "annotation"/"anchor"); only their tail text is kept.
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append('text', element.tail)
            else:
                # Record the event, with a copy of the element, on every open fragment.
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append(event, copy.copy(element))

    return closed_fragments, open_fragments
 205
 206
 207 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 208     parent = element.getparent()
 209     index = parent.index(element)
 210
 211     if with_link:
 212         if link_text is None:
 213             link_text = prefix
 214         anchor = etree.Element('a', href='#%s' % prefix)
 215         anchor.set('class', 'anchor')
 216         anchor.text = six.text_type(link_text)
 217         parent.insert(index, anchor)
 218
 219     if with_target:
 220         anchor_target = etree.Element('a', name='%s' % prefix)
 221         anchor_target.set('class', 'target')
 222         anchor_target.text = u' '
 223         parent.insert(index, anchor_target)
 224
 225
 226 def any_ancestor(element, test):
 227     for ancestor in element.iterancestors():
 228         if test(ancestor):
 229             return True
 230     return False
 231
 232
 233 def add_anchors(root):
 234     counter = 1
 235     for element in root.iterdescendants():
 236         def f(e):
 237             return e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication', 'frame') or \
 238                 e.get('id') == 'nota_red' or e.tag == 'blockquote'
 239         if any_ancestor(element, f):
 240             continue
 241
 242         if element.tag == 'p' and 'verse' in element.get('class', ''):
 243             if counter == 1 or counter % 5 == 0:
 244                 add_anchor(element, "f%d" % counter, link_text=counter)
 245             counter += 1
 246         elif 'paragraph' in element.get('class', ''):
 247             add_anchor(element, "f%d" % counter, link_text=counter)
 248             counter += 1
 249
 250
 251 def raw_printable_text(element):
 252     working = copy.deepcopy(element)
 253     for e in working.findall('a'):
 254         if e.get('class') in ('annotation', 'theme-begin'):
 255             e.text = ''
 256     return etree.tostring(working, method='text', encoding='unicode').strip()
 257
 258
 259 def add_table_of_contents(root):
 260     sections = []
 261     counter = 1
 262     for element in root.iterdescendants():
 263         if element.tag in ('h2', 'h3'):
 264             if any_ancestor(element,
 265                             lambda e: e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)):
 266                 continue
 267
 268             element_text = raw_printable_text(element)
 269             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 270                 sections[-1][3].append((counter, element.tag, element_text, []))
 271             else:
 272                 sections.append((counter, element.tag, element_text, []))
 273             add_anchor(element, "s%d" % counter, with_link=False)
 274             counter += 1
 275
 276     toc = etree.Element('div')
 277     toc.set('id', 'toc')
 278     toc_header = etree.SubElement(toc, 'h2')
 279     toc_header.text = u'Spis treści'
 280     toc_list = etree.SubElement(toc, 'ol')
 281
 282     for n, section, text, subsections in sections:
 283         section_element = etree.SubElement(toc_list, 'li')
 284         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 285
 286         if len(subsections):
 287             subsection_list = etree.SubElement(section_element, 'ol')
 288             for n1, subsection, subtext, _ in subsections:
 289                 subsection_element = etree.SubElement(subsection_list, 'li')
 290                 add_anchor(subsection_element, "s%d" % n1, with_target=False, link_text=subtext)
 291
 292     root.insert(0, toc)
 293
 294
 295 def add_table_of_themes(root):
 296     try:
 297         from sortify import sortify
 298     except ImportError:
 299         def sortify(x):
 300             return x
 301
 302     book_themes = {}
 303     for fragment in root.findall('.//a[@class="theme-begin"]'):
 304         if not fragment.text:
 305             continue
 306         theme_names = [s.strip() for s in fragment.text.split(',')]
 307         for theme_name in theme_names:
 308             book_themes.setdefault(theme_name, []).append(fragment.get('name'))
 309     book_themes = list(book_themes.items())
 310     book_themes.sort(key=lambda s: sortify(s[0]))
 311     themes_div = etree.Element('div', id="themes")
 312     themes_ol = etree.SubElement(themes_div, 'ol')
 313     for theme_name, fragments in book_themes:
 314         themes_li = etree.SubElement(themes_ol, 'li')
 315         themes_li.text = "%s: " % theme_name
 316         for i, fragment in enumerate(fragments):
 317             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
 318             item.text = str(i + 1)
 319             item.tail = ' '
 320     root.insert(0, themes_div)
 321
 322
 323 def extract_annotations(html_path):
 324     """Extracts annotations from HTML for annotations dictionary.
 325
 326     For each annotation, yields a tuple of:
 327     anchor, footnote type, valid qualifiers, text, html.
 328
 329     """
 330     from .fn_qualifiers import FN_QUALIFIERS
 331
 332     parser = etree.HTMLParser(encoding='utf-8')
 333     tree = etree.parse(html_path, parser)
 334     footnotes = tree.find('//*[@id="footnotes"]')
 335     re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
 336     if footnotes is not None:
 337         for footnote in footnotes.findall('div'):
 338             fn_type = footnote.get('class').split('-')[1]
 339             anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
 340             del footnote[:2]
 341             footnote.text = None
 342             if len(footnote) and footnote[-1].tail == '\n':
 343                 footnote[-1].tail = None
 344             text_str = etree.tostring(footnote, method='text', encoding='unicode').strip()
 345             html_str = etree.tostring(footnote, method='html', encoding='unicode').strip()
 346
 347             match = re_qualifier.match(text_str)
 348             if match:
 349                 qualifier_str = match.group(1)
 350                 qualifiers = []
 351                 for candidate in re.split('[;,]', qualifier_str):
 352                     candidate = candidate.strip()
 353                     if candidate in FN_QUALIFIERS:
 354                         qualifiers.append(candidate)
 355                     elif candidate.startswith('z '):
 356                         subcandidate = candidate.split()[1]
 357                         if subcandidate in FN_QUALIFIERS:
 358                             qualifiers.append(subcandidate)
 359             else:
 360                 qualifiers = []
 361
 362             yield anchor, fn_type, qualifiers, text_str, html_str