src/librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import print_function, unicode_literals
   7
   8 import os
   9 import re
  10 import copy
  11
  12 from lxml import etree
  13 from librarian import XHTMLNS, ParseError, OutputFile
  14 from librarian import functions
  15
  16 from lxml.etree import XMLSyntaxError, XSLTApplyError
  17 import six
  18
  19
# Run the registration helpers from librarian.functions at import time.
# NOTE(review): presumably these expose extension functions that the XSLT
# stylesheets below rely on -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Maps a stylesheet name (as accepted by get_stylesheet()) to the path of
# the XSLT file, relative to the directory containing this module.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
  28
  29
  30 def get_stylesheet(name):
  31     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  32
  33
  34 def html_has_content(text):
  35     return etree.ETXPath(
  36         '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
  37     )(text)
  38
  39
  40 def transform_abstrakt(abstrakt_element):
  41     style_filename = get_stylesheet('legacy')
  42     style = etree.parse(style_filename)
  43     xml = etree.tostring(abstrakt_element, encoding='unicode')
  44     document = etree.parse(six.StringIO(
  45         xml.replace('abstrakt', 'dlugi_cytat')
  46     ))  # HACK
  47     result = document.xslt(style)
  48     html = re.sub('<a name="sec[0-9]*"/>', '',
  49                   etree.tostring(result, encoding='unicode'))
  50     return re.sub('</?blockquote[^>]*>', '', html)
  51
  52
  53 def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None):
  54     """Transforms the WL document to XHTML.
  55
  56     If output_filename is None, returns an XML,
  57     otherwise returns True if file has been written,False if it hasn't.
  58     File won't be written if it has no content.
  59     """
  60     # Parse XSLT
  61     try:
  62         style_filename = get_stylesheet(stylesheet)
  63         style = etree.parse(style_filename)
  64
  65         document = copy.deepcopy(wldoc)
  66         del wldoc
  67         document.swap_endlines()
  68
  69         if flags:
  70             for flag in flags:
  71                 document.edoc.getroot().set(flag, 'yes')
  72
  73         document.clean_ed_note()
  74         document.clean_ed_note('abstrakt')
  75
  76         if not options:
  77             options = {}
  78         options.setdefault('gallery', "''")
  79
  80         css = (
  81             css
  82             or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
  83         )
  84         css = "'%s'" % css
  85         result = document.transform(style, css=css, **options)
  86         del document  # no longer needed large object :)
  87
  88         if html_has_content(result):
  89             add_anchors(result.getroot())
  90             add_table_of_themes(result.getroot())
  91             add_table_of_contents(result.getroot())
  92
  93             return OutputFile.from_bytes(etree.tostring(
  94                 result, method='html', xml_declaration=False,
  95                 pretty_print=True, encoding='utf-8'
  96             ))
  97         else:
  98             return None
  99     except KeyError:
 100         raise ValueError("'%s' is not a valid stylesheet.")
 101     except (XMLSyntaxError, XSLTApplyError) as e:
 102         raise ParseError(e)
 103
 104
 105 @six.python_2_unicode_compatible
 106 class Fragment(object):
 107     def __init__(self, id, themes):
 108         super(Fragment, self).__init__()
 109         self.id = id
 110         self.themes = themes
 111         self.events = []
 112
 113     def append(self, event, element):
 114         self.events.append((event, element))
 115
 116     def closed_events(self):
 117         stack = []
 118         for event, element in self.events:
 119             if event == 'start':
 120                 stack.append(('end', element))
 121             elif event == 'end':
 122                 try:
 123                     stack.pop()
 124                 except IndexError:
 125                     print('CLOSED NON-OPEN TAG:', element)
 126
 127         stack.reverse()
 128         return self.events + stack
 129
 130     def to_string(self):
 131         result = []
 132         for event, element in self.closed_events():
 133             if event == 'start':
 134                 result.append(u'<%s %s>' % (
 135                     element.tag,
 136                     ' '.join(
 137                         '%s="%s"' % (k, v)
 138                         for k, v in element.attrib.items()
 139                     )
 140                 ))
 141                 if element.text:
 142                     result.append(element.text)
 143             elif event == 'end':
 144                 result.append(u'</%s>' % element.tag)
 145                 if element.tail:
 146                     result.append(element.tail)
 147             else:
 148                 result.append(element)
 149
 150         return ''.join(result)
 151
 152     def __str__(self):
 153         return self.to_string()
 154
 155
 156 def extract_fragments(input_filename):
 157     """Extracts theme fragments from input_filename."""
 158     open_fragments = {}
 159     closed_fragments = {}
 160
 161     # iterparse would die on a HTML document
 162     parser = etree.HTMLParser(encoding='utf-8')
 163     buf = six.BytesIO()
 164     buf.write(etree.tostring(
 165         etree.parse(input_filename, parser).getroot()[0][0],
 166         encoding='utf-8'
 167     ))
 168     buf.seek(0)
 169
 170     for event, element in etree.iterparse(buf, events=('start', 'end')):
 171         # Process begin and end elements
 172         if element.get('class', '') in ('theme-begin', 'theme-end'):
 173             if not event == 'end':
 174                 continue  # Process elements only once, on end event
 175
 176             # Open new fragment
 177             if element.get('class', '') == 'theme-begin':
 178                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 179
 180                 # Append parents
 181                 parent = element.getparent()
 182                 parents = []
 183                 while parent.get('id', None) != 'book-text':
 184                     cparent = copy.deepcopy(parent)
 185                     cparent.text = None
 186                     parents.append(cparent)
 187                     parent = parent.getparent()
 188
 189                 parents.reverse()
 190                 for parent in parents:
 191                     fragment.append('start', parent)
 192
 193                 open_fragments[fragment.id] = fragment
 194
 195             # Close existing fragment
 196             else:
 197                 try:
 198                     fragment = open_fragments[element.get('fid')]
 199                 except KeyError:
 200                     print('%s:closed not open fragment #%s' % (
 201                         input_filename, element.get('fid')
 202                     ))
 203                 else:
 204                     closed_fragments[fragment.id] = fragment
 205                     del open_fragments[fragment.id]
 206
 207             # Append element tail to lost_text
 208             # (we don't want to lose any text)
 209             if element.tail:
 210                 for fragment_id in open_fragments:
 211                     open_fragments[fragment_id].append('text', element.tail)
 212
 213         # Process all elements except begin and end
 214         else:
 215             # Omit annotation tags
 216             if (len(element.get('name', '')) or
 217                     element.get('class', '') in ('annotation', 'anchor')):
 218                 if event == 'end' and element.tail:
 219                     for fragment_id in open_fragments:
 220                         open_fragments[fragment_id].append(
 221                             'text', element.tail
 222                         )
 223             else:
 224                 for fragment_id in open_fragments:
 225                     open_fragments[fragment_id].append(
 226                         event, copy.copy(element)
 227                     )
 228
 229     return closed_fragments, open_fragments
 230
 231
 232 def add_anchor(element, prefix, with_link=True, with_target=True,
 233                link_text=None):
 234     parent = element.getparent()
 235     index = parent.index(element)
 236
 237     if with_link:
 238         if link_text is None:
 239             link_text = prefix
 240         anchor = etree.Element('a', href='#%s' % prefix)
 241         anchor.set('class', 'anchor')
 242         anchor.text = six.text_type(link_text)
 243         parent.insert(index, anchor)
 244
 245     if with_target:
 246         anchor_target = etree.Element('a', name='%s' % prefix)
 247         anchor_target.set('class', 'target')
 248         anchor_target.text = u' '
 249         parent.insert(index, anchor_target)
 250
 251
 252 def any_ancestor(element, test):
 253     for ancestor in element.iterancestors():
 254         if test(ancestor):
 255             return True
 256     return False
 257
 258
 259 def add_anchors(root):
 260     counter = 1
 261     for element in root.iterdescendants():
 262         def f(e):
 263             return (
 264                 e.get('class') in (
 265                     'note', 'motto', 'motto_podpis', 'dedication', 'frame'
 266                 )
 267                 or e.get('id') == 'nota_red'
 268                 or e.tag == 'blockquote'
 269             )
 270         if any_ancestor(element, f):
 271             continue
 272
 273         if element.tag == 'div' and 'verse' in element.get('class', ''):
 274             if counter == 1 or counter % 5 == 0:
 275                 add_anchor(element, "f%d" % counter, link_text=counter)
 276             counter += 1
 277         elif 'paragraph' in element.get('class', ''):
 278             add_anchor(element, "f%d" % counter, link_text=counter)
 279             counter += 1
 280
 281
 282 def raw_printable_text(element):
 283     working = copy.deepcopy(element)
 284     for e in working.findall('a'):
 285         if e.get('class') in ('annotation', 'theme-begin'):
 286             e.text = ''
 287     return etree.tostring(working, method='text', encoding='unicode').strip()
 288
 289
 290 def add_table_of_contents(root):
 291     sections = []
 292     counter = 1
 293     for element in root.iterdescendants():
 294         if element.tag in ('h2', 'h3'):
 295             if any_ancestor(
 296                     element,
 297                     lambda e: e.get('id') in (
 298                         'footnotes', 'nota_red'
 299                     ) or e.get('class') in ('person-list',)):
 300                 continue
 301
 302             element_text = raw_printable_text(element)
 303             if (element.tag == 'h3' and len(sections)
 304                     and sections[-1][1] == 'h2'):
 305                 sections[-1][3].append(
 306                     (counter, element.tag, element_text, [])
 307                 )
 308             else:
 309                 sections.append((counter, element.tag, element_text, []))
 310             add_anchor(element, "s%d" % counter, with_link=False)
 311             counter += 1
 312
 313     toc = etree.Element('div')
 314     toc.set('id', 'toc')
 315     toc_header = etree.SubElement(toc, 'h2')
 316     toc_header.text = u'Spis treści'
 317     toc_list = etree.SubElement(toc, 'ol')
 318
 319     for n, section, text, subsections in sections:
 320         section_element = etree.SubElement(toc_list, 'li')
 321         add_anchor(section_element, "s%d" % n, with_target=False,
 322                    link_text=text)
 323
 324         if len(subsections):
 325             subsection_list = etree.SubElement(section_element, 'ol')
 326             for n1, subsection, subtext, _ in subsections:
 327                 subsection_element = etree.SubElement(subsection_list, 'li')
 328                 add_anchor(subsection_element, "s%d" % n1, with_target=False,
 329                            link_text=subtext)
 330
 331     root.insert(0, toc)
 332
 333
 334 def add_table_of_themes(root):
 335     try:
 336         from sortify import sortify
 337     except ImportError:
 338         def sortify(x):
 339             return x
 340
 341     book_themes = {}
 342     for fragment in root.findall('.//a[@class="theme-begin"]'):
 343         if not fragment.text:
 344             continue
 345         theme_names = [s.strip() for s in fragment.text.split(',')]
 346         for theme_name in theme_names:
 347             book_themes.setdefault(theme_name, []).append(fragment.get('name'))
 348     book_themes = list(book_themes.items())
 349     book_themes.sort(key=lambda s: sortify(s[0]))
 350     themes_div = etree.Element('div', id="themes")
 351     themes_ol = etree.SubElement(themes_div, 'ol')
 352     for theme_name, fragments in book_themes:
 353         themes_li = etree.SubElement(themes_ol, 'li')
 354         themes_li.text = "%s: " % theme_name
 355         for i, fragment in enumerate(fragments):
 356             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
 357             item.text = str(i + 1)
 358             item.tail = ' '
 359     root.insert(0, themes_div)
 360
 361
 362 def extract_annotations(html_path):
 363     """Extracts annotations from HTML for annotations dictionary.
 364
 365     For each annotation, yields a tuple of:
 366     anchor, footnote type, valid qualifiers, text, html.
 367
 368     """
 369     from .fn_qualifiers import FN_QUALIFIERS
 370
 371     parser = etree.HTMLParser(encoding='utf-8')
 372     tree = etree.parse(html_path, parser)
 373     footnotes = tree.find('//*[@id="footnotes"]')
 374     re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
 375     if footnotes is not None:
 376         for footnote in footnotes.findall('div'):
 377             fn_type = footnote.get('class').split('-')[1]
 378             anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
 379             del footnote[:2]
 380             footnote.text = None
 381             if len(footnote) and footnote[-1].tail == '\n':
 382                 footnote[-1].tail = None
 383             text_str = etree.tostring(footnote, method='text',
 384                                       encoding='unicode').strip()
 385             html_str = etree.tostring(footnote, method='html',
 386                                       encoding='unicode').strip()
 387
 388             match = re_qualifier.match(text_str)
 389             if match:
 390                 qualifier_str = match.group(1)
 391                 qualifiers = []
 392                 for candidate in re.split('[;,]', qualifier_str):
 393                     candidate = candidate.strip()
 394                     if candidate in FN_QUALIFIERS:
 395                         qualifiers.append(candidate)
 396                     elif candidate.startswith('z '):
 397                         subcandidate = candidate.split()[1]
 398                         if subcandidate in FN_QUALIFIERS:
 399                             qualifiers.append(subcandidate)
 400             else:
 401                 qualifiers = []
 402
 403             yield anchor, fn_type, qualifiers, text_str, html_str