librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import print_function, unicode_literals
   7
   8 import os
   9 import re
  10 import copy
  11
  12 from lxml import etree
  13 from librarian import XHTMLNS, ParseError, OutputFile
  14 from librarian import functions
  15
  16 from lxml.etree import XMLSyntaxError, XSLTApplyError
  17 import six
  18
  19
# Module-level side effect: both calls below run once, at import time.
# NOTE(review): presumably they register the custom XPath/XSLT extension
# functions used by the stylesheets -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Maps the stylesheet names accepted by get_stylesheet() to XSLT file paths
# relative to the directory containing this module.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
  28
  29
  30 def get_stylesheet(name):
  31     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  32
  33
  34 def html_has_content(text):
  35     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  36
  37
  38 def transform_abstrakt(abstrakt_element):
  39     style_filename = get_stylesheet('legacy')
  40     style = etree.parse(style_filename)
  41     xml = etree.tostring(abstrakt_element, encoding='unicode')
  42     document = etree.parse(six.StringIO(xml.replace('abstrakt', 'dlugi_cytat')))  # HACK
  43     result = document.xslt(style)
  44     html = re.sub('<a name="sec[0-9]*"/>', '', etree.tostring(result, encoding='unicode'))
  45     return re.sub('</?blockquote[^>]*>', '', html)
  46
  47
  48 def transform(wldoc, stylesheet='legacy', options=None, flags=None):
  49     """Transforms the WL document to XHTML.
  50
  51     If output_filename is None, returns an XML,
  52     otherwise returns True if file has been written,False if it hasn't.
  53     File won't be written if it has no content.
  54     """
  55     # Parse XSLT
  56     try:
  57         style_filename = get_stylesheet(stylesheet)
  58         style = etree.parse(style_filename)
  59
  60         document = copy.deepcopy(wldoc)
  61         del wldoc
  62         document.swap_endlines()
  63
  64         if flags:
  65             for flag in flags:
  66                 document.edoc.getroot().set(flag, 'yes')
  67
  68         document.clean_ed_note()
  69         document.clean_ed_note('abstrakt')
  70
  71         if not options:
  72             options = {}
  73         options.setdefault('gallery', "''")
  74         result = document.transform(style, **options)
  75         del document  # no longer needed large object :)
  76
  77         if html_has_content(result):
  78             add_anchors(result.getroot())
  79             add_table_of_themes(result.getroot())
  80             add_table_of_contents(result.getroot())
  81
  82             return OutputFile.from_bytes(etree.tostring(
  83                 result, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8'))
  84         else:
  85             return None
  86     except KeyError:
  87         raise ValueError("'%s' is not a valid stylesheet.")
  88     except (XMLSyntaxError, XSLTApplyError) as e:
  89         raise ParseError(e)
  90
  91
  92 @six.python_2_unicode_compatible
  93 class Fragment(object):
  94     def __init__(self, id, themes):
  95         super(Fragment, self).__init__()
  96         self.id = id
  97         self.themes = themes
  98         self.events = []
  99
 100     def append(self, event, element):
 101         self.events.append((event, element))
 102
 103     def closed_events(self):
 104         stack = []
 105         for event, element in self.events:
 106             if event == 'start':
 107                 stack.append(('end', element))
 108             elif event == 'end':
 109                 try:
 110                     stack.pop()
 111                 except IndexError:
 112                     print('CLOSED NON-OPEN TAG:', element)
 113
 114         stack.reverse()
 115         return self.events + stack
 116
 117     def to_string(self):
 118         result = []
 119         for event, element in self.closed_events():
 120             if event == 'start':
 121                 result.append(u'<%s %s>' % (
 122                     element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 123                 if element.text:
 124                     result.append(element.text)
 125             elif event == 'end':
 126                 result.append(u'</%s>' % element.tag)
 127                 if element.tail:
 128                     result.append(element.tail)
 129             else:
 130                 result.append(element)
 131
 132         return ''.join(result)
 133
 134     def __str__(self):
 135         return self.to_string()
 136
 137
def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename.

    The HTML file is walked as a stream of start/end events.  Elements with
    class 'theme-begin' open a Fragment keyed by their 'fid' attribute and
    elements with class 'theme-end' close it; everything seen in between is
    recorded on every fragment that is open at that moment.

    Returns a tuple (closed_fragments, open_fragments) of dicts mapping
    fragment id to Fragment; open_fragments holds the fragments that were
    never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document
    # ...so the file is parsed with the HTML parser, the first child of the
    # root's first child is re-serialized to bytes, and iterparse runs on that.
    # NOTE(review): [0][0] presumably selects the #book-text element -- confirm.
    parser = etree.HTMLParser(encoding='utf-8')
    buf = six.BytesIO()
    buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append parents
                # Walk up to (but excluding) the element with id 'book-text',
                # recording a text-less copy of each ancestor.
                # NOTE(review): if no ancestor has that id, getparent()
                # eventually returns None and .get() raises AttributeError.
                parent = element.getparent()
                parents = []
                while parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    parents.append(cparent)
                    parent = parent.getparent()

                # Outermost ancestor first, so the start events nest correctly.
                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (input_filename, element.get('fid')))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the marker's tail to every open fragment (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags
            # (elements with a non-empty 'name' attribute or class
            # 'annotation'/'anchor'); only their tail text is kept.
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append('text', element.tail)
            else:
                # Record a shallow copy of the element for both its start and
                # its end event on every open fragment.
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append(event, copy.copy(element))

    return closed_fragments, open_fragments
 202
 203
 204 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 205     parent = element.getparent()
 206     index = parent.index(element)
 207
 208     if with_link:
 209         if link_text is None:
 210             link_text = prefix
 211         anchor = etree.Element('a', href='#%s' % prefix)
 212         anchor.set('class', 'anchor')
 213         anchor.text = six.text_type(link_text)
 214         parent.insert(index, anchor)
 215
 216     if with_target:
 217         anchor_target = etree.Element('a', name='%s' % prefix)
 218         anchor_target.set('class', 'target')
 219         anchor_target.text = u' '
 220         parent.insert(index, anchor_target)
 221
 222
 223 def any_ancestor(element, test):
 224     for ancestor in element.iterancestors():
 225         if test(ancestor):
 226             return True
 227     return False
 228
 229
 230 def add_anchors(root):
 231     counter = 1
 232     for element in root.iterdescendants():
 233         def f(e):
 234             return e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication', 'frame') or \
 235                 e.get('id') == 'nota_red' or e.tag == 'blockquote'
 236         if any_ancestor(element, f):
 237             continue
 238
 239         if element.tag == 'p' and 'verse' in element.get('class', ''):
 240             if counter == 1 or counter % 5 == 0:
 241                 add_anchor(element, "f%d" % counter, link_text=counter)
 242             counter += 1
 243         elif 'paragraph' in element.get('class', ''):
 244             add_anchor(element, "f%d" % counter, link_text=counter)
 245             counter += 1
 246
 247
 248 def raw_printable_text(element):
 249     working = copy.deepcopy(element)
 250     for e in working.findall('a'):
 251         if e.get('class') in ('annotation', 'theme-begin'):
 252             e.text = ''
 253     return etree.tostring(working, method='text', encoding='unicode').strip()
 254
 255
 256 def add_table_of_contents(root):
 257     sections = []
 258     counter = 1
 259     for element in root.iterdescendants():
 260         if element.tag in ('h2', 'h3'):
 261             if any_ancestor(element,
 262                             lambda e: e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)):
 263                 continue
 264
 265             element_text = raw_printable_text(element)
 266             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 267                 sections[-1][3].append((counter, element.tag, element_text, []))
 268             else:
 269                 sections.append((counter, element.tag, element_text, []))
 270             add_anchor(element, "s%d" % counter, with_link=False)
 271             counter += 1
 272
 273     toc = etree.Element('div')
 274     toc.set('id', 'toc')
 275     toc_header = etree.SubElement(toc, 'h2')
 276     toc_header.text = u'Spis treści'
 277     toc_list = etree.SubElement(toc, 'ol')
 278
 279     for n, section, text, subsections in sections:
 280         section_element = etree.SubElement(toc_list, 'li')
 281         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 282
 283         if len(subsections):
 284             subsection_list = etree.SubElement(section_element, 'ol')
 285             for n1, subsection, subtext, _ in subsections:
 286                 subsection_element = etree.SubElement(subsection_list, 'li')
 287                 add_anchor(subsection_element, "s%d" % n1, with_target=False, link_text=subtext)
 288
 289     root.insert(0, toc)
 290
 291
 292 def add_table_of_themes(root):
 293     try:
 294         from sortify import sortify
 295     except ImportError:
 296         def sortify(x):
 297             return x
 298
 299     book_themes = {}
 300     for fragment in root.findall('.//a[@class="theme-begin"]'):
 301         if not fragment.text:
 302             continue
 303         theme_names = [s.strip() for s in fragment.text.split(',')]
 304         for theme_name in theme_names:
 305             book_themes.setdefault(theme_name, []).append(fragment.get('name'))
 306     book_themes = list(book_themes.items())
 307     book_themes.sort(key=lambda s: sortify(s[0]))
 308     themes_div = etree.Element('div', id="themes")
 309     themes_ol = etree.SubElement(themes_div, 'ol')
 310     for theme_name, fragments in book_themes:
 311         themes_li = etree.SubElement(themes_ol, 'li')
 312         themes_li.text = "%s: " % theme_name
 313         for i, fragment in enumerate(fragments):
 314             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
 315             item.text = str(i + 1)
 316             item.tail = ' '
 317     root.insert(0, themes_div)
 318
 319
 320 def extract_annotations(html_path):
 321     """Extracts annotations from HTML for annotations dictionary.
 322
 323     For each annotation, yields a tuple of:
 324     anchor, footnote type, valid qualifiers, text, html.
 325
 326     """
 327     from .fn_qualifiers import FN_QUALIFIERS
 328
 329     parser = etree.HTMLParser(encoding='utf-8')
 330     tree = etree.parse(html_path, parser)
 331     footnotes = tree.find('//*[@id="footnotes"]')
 332     re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
 333     if footnotes is not None:
 334         for footnote in footnotes.findall('div'):
 335             fn_type = footnote.get('class').split('-')[1]
 336             anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
 337             del footnote[:2]
 338             footnote.text = None
 339             if len(footnote) and footnote[-1].tail == '\n':
 340                 footnote[-1].tail = None
 341             text_str = etree.tostring(footnote, method='text', encoding='unicode').strip()
 342             html_str = etree.tostring(footnote, method='html', encoding='unicode').strip()
 343
 344             match = re_qualifier.match(text_str)
 345             if match:
 346                 qualifier_str = match.group(1)
 347                 qualifiers = []
 348                 for candidate in re.split('[;,]', qualifier_str):
 349                     candidate = candidate.strip()
 350                     if candidate in FN_QUALIFIERS:
 351                         qualifiers.append(candidate)
 352                     elif candidate.startswith('z '):
 353                         subcandidate = candidate.split()[1]
 354                         if subcandidate in FN_QUALIFIERS:
 355                             qualifiers.append(subcandidate)
 356             else:
 357                 qualifiers = []
 358
 359             yield anchor, fn_type, qualifiers, text_str, html_str