librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import re
   8 import cStringIO
   9 import copy
  10
  11 from lxml import etree
  12 from librarian import XHTMLNS, ParseError, OutputFile
  13 from librarian import functions
  14
  15 from lxml.etree import XMLSyntaxError, XSLTApplyError
  16
# Import-time side effect: run the two registration hooks exposed by
# librarian.functions (presumably they register the XPath extension
# functions the XSLT stylesheets rely on -- TODO confirm in
# librarian/functions.py).
functions.reg_substitute_entities()
functions.reg_person_name()
  19
  20 STYLESHEETS = {
  21     'legacy': 'xslt/book2html.xslt',
  22     'full': 'xslt/wl2html_full.xslt',
  23     'partial': 'xslt/wl2html_partial.xslt'
  24 }
  25
  26
  27 def get_stylesheet(name):
  28     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  29
  30
  31 def html_has_content(text):
  32     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  33
  34
  35 def transform(wldoc, stylesheet='legacy', options=None, flags=None):
  36     """Transforms the WL document to XHTML.
  37
  38     If output_filename is None, returns an XML,
  39     otherwise returns True if file has been written,False if it hasn't.
  40     File won't be written if it has no content.
  41     """
  42     # Parse XSLT
  43     try:
  44         style_filename = get_stylesheet(stylesheet)
  45         style = etree.parse(style_filename)
  46
  47         document = copy.deepcopy(wldoc)
  48         del wldoc
  49         document.swap_endlines()
  50
  51         if flags:
  52             for flag in flags:
  53                 document.edoc.getroot().set(flag, 'yes')
  54
  55         document.clean_ed_note()
  56         document.clean_ed_note('abstrakt')
  57
  58         if not options:
  59             options = {}
  60         options.setdefault('gallery', "''")
  61         result = document.transform(style, **options)
  62         del document  # no longer needed large object :)
  63
  64         if html_has_content(result):
  65             add_anchors(result.getroot())
  66             add_table_of_themes(result.getroot())
  67             add_table_of_contents(result.getroot())
  68
  69             return OutputFile.from_string(etree.tostring(
  70                 result, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8'))
  71         else:
  72             return None
  73     except KeyError:
  74         raise ValueError("'%s' is not a valid stylesheet.")
  75     except (XMLSyntaxError, XSLTApplyError), e:
  76         raise ParseError(e)
  77
  78
  79 class Fragment(object):
  80     def __init__(self, id, themes):
  81         super(Fragment, self).__init__()
  82         self.id = id
  83         self.themes = themes
  84         self.events = []
  85
  86     def append(self, event, element):
  87         self.events.append((event, element))
  88
  89     def closed_events(self):
  90         stack = []
  91         for event, element in self.events:
  92             if event == 'start':
  93                 stack.append(('end', element))
  94             elif event == 'end':
  95                 try:
  96                     stack.pop()
  97                 except IndexError:
  98                     print 'CLOSED NON-OPEN TAG:', element
  99
 100         stack.reverse()
 101         return self.events + stack
 102
 103     def to_string(self):
 104         result = []
 105         for event, element in self.closed_events():
 106             if event == 'start':
 107                 result.append(u'<%s %s>' % (
 108                     element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 109                 if element.text:
 110                     result.append(element.text)
 111             elif event == 'end':
 112                 result.append(u'</%s>' % element.tag)
 113                 if element.tail:
 114                     result.append(element.tail)
 115             else:
 116                 result.append(element)
 117
 118         return ''.join(result)
 119
 120     def __unicode__(self):
 121         return self.to_string()
 122
 123
 124 def extract_fragments(input_filename):
 125     """Extracts theme fragments from input_filename."""
 126     open_fragments = {}
 127     closed_fragments = {}
 128
 129     # iterparse would die on a HTML document
 130     parser = etree.HTMLParser(encoding='utf-8')
 131     buf = cStringIO.StringIO()
 132     buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
 133     buf.seek(0)
 134
 135     for event, element in etree.iterparse(buf, events=('start', 'end')):
 136         # Process begin and end elements
 137         if element.get('class', '') in ('theme-begin', 'theme-end'):
 138             if not event == 'end':
 139                 continue  # Process elements only once, on end event
 140
 141             # Open new fragment
 142             if element.get('class', '') == 'theme-begin':
 143                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 144
 145                 # Append parents
 146                 parent = element.getparent()
 147                 parents = []
 148                 while parent.get('id', None) != 'book-text':
 149                     cparent = copy.deepcopy(parent)
 150                     cparent.text = None
 151                     parents.append(cparent)
 152                     parent = parent.getparent()
 153
 154                 parents.reverse()
 155                 for parent in parents:
 156                     fragment.append('start', parent)
 157
 158                 open_fragments[fragment.id] = fragment
 159
 160             # Close existing fragment
 161             else:
 162                 try:
 163                     fragment = open_fragments[element.get('fid')]
 164                 except KeyError:
 165                     print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
 166                 else:
 167                     closed_fragments[fragment.id] = fragment
 168                     del open_fragments[fragment.id]
 169
 170             # Append element tail to lost_text (we don't want to lose any text)
 171             if element.tail:
 172                 for fragment_id in open_fragments:
 173                     open_fragments[fragment_id].append('text', element.tail)
 174
 175         # Process all elements except begin and end
 176         else:
 177             # Omit annotation tags
 178             if (len(element.get('name', '')) or
 179                     element.get('class', '') in ('annotation', 'anchor')):
 180                 if event == 'end' and element.tail:
 181                     for fragment_id in open_fragments:
 182                         open_fragments[fragment_id].append('text', element.tail)
 183             else:
 184                 for fragment_id in open_fragments:
 185                     open_fragments[fragment_id].append(event, copy.copy(element))
 186
 187     return closed_fragments, open_fragments
 188
 189
 190 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 191     parent = element.getparent()
 192     index = parent.index(element)
 193
 194     if with_link:
 195         if link_text is None:
 196             link_text = prefix
 197         anchor = etree.Element('a', href='#%s' % prefix)
 198         anchor.set('class', 'anchor')
 199         anchor.text = unicode(link_text)
 200         parent.insert(index, anchor)
 201
 202     if with_target:
 203         anchor_target = etree.Element('a', name='%s' % prefix)
 204         anchor_target.set('class', 'target')
 205         anchor_target.text = u' '
 206         parent.insert(index, anchor_target)
 207
 208
 209 def any_ancestor(element, test):
 210     for ancestor in element.iterancestors():
 211         if test(ancestor):
 212             return True
 213     return False
 214
 215
 216 def add_anchors(root):
 217     counter = 1
 218     for element in root.iterdescendants():
 219         def f(e):
 220             return e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication') or \
 221                 e.get('id') == 'nota_red' or e.tag == 'blockquote'
 222         if any_ancestor(element, f):
 223             continue
 224
 225         if element.tag == 'p' and 'verse' in element.get('class', ''):
 226             if counter == 1 or counter % 5 == 0:
 227                 add_anchor(element, "f%d" % counter, link_text=counter)
 228             counter += 1
 229         elif 'paragraph' in element.get('class', ''):
 230             add_anchor(element, "f%d" % counter, link_text=counter)
 231             counter += 1
 232
 233
 234 def raw_printable_text(element):
 235     working = copy.deepcopy(element)
 236     for e in working.findall('a'):
 237         if e.get('class') in ('annotation', 'theme-begin'):
 238             e.text = ''
 239     return etree.tostring(working, method='text', encoding=unicode).strip()
 240
 241
 242 def add_table_of_contents(root):
 243     sections = []
 244     counter = 1
 245     for element in root.iterdescendants():
 246         if element.tag in ('h2', 'h3'):
 247             if any_ancestor(element,
 248                             lambda e: e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)):
 249                 continue
 250
 251             element_text = raw_printable_text(element)
 252             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 253                 sections[-1][3].append((counter, element.tag, element_text, []))
 254             else:
 255                 sections.append((counter, element.tag, element_text, []))
 256             add_anchor(element, "s%d" % counter, with_link=False)
 257             counter += 1
 258
 259     toc = etree.Element('div')
 260     toc.set('id', 'toc')
 261     toc_header = etree.SubElement(toc, 'h2')
 262     toc_header.text = u'Spis treści'
 263     toc_list = etree.SubElement(toc, 'ol')
 264
 265     for n, section, text, subsections in sections:
 266         section_element = etree.SubElement(toc_list, 'li')
 267         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 268
 269         if len(subsections):
 270             subsection_list = etree.SubElement(section_element, 'ol')
 271             for n1, subsection, subtext, _ in subsections:
 272                 subsection_element = etree.SubElement(subsection_list, 'li')
 273                 add_anchor(subsection_element, "s%d" % n1, with_target=False, link_text=subtext)
 274
 275     root.insert(0, toc)
 276
 277
 278 def add_table_of_themes(root):
 279     try:
 280         from sortify import sortify
 281     except ImportError:
 282         def sortify(x):
 283             return x
 284
 285     book_themes = {}
 286     for fragment in root.findall('.//a[@class="theme-begin"]'):
 287         if not fragment.text:
 288             continue
 289         theme_names = [s.strip() for s in fragment.text.split(',')]
 290         for theme_name in theme_names:
 291             book_themes.setdefault(theme_name, []).append(fragment.get('name'))
 292     book_themes = book_themes.items()
 293     book_themes.sort(key=lambda s: sortify(s[0]))
 294     themes_div = etree.Element('div', id="themes")
 295     themes_ol = etree.SubElement(themes_div, 'ol')
 296     for theme_name, fragments in book_themes:
 297         themes_li = etree.SubElement(themes_ol, 'li')
 298         themes_li.text = "%s: " % theme_name
 299         for i, fragment in enumerate(fragments):
 300             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
 301             item.text = str(i + 1)
 302             item.tail = ' '
 303     root.insert(0, themes_div)
 304
 305
 306 def extract_annotations(html_path):
 307     """Extracts annotations from HTML for annotations dictionary.
 308
 309     For each annotation, yields a tuple of:
 310     anchor, footnote type, valid qualifiers, text, html.
 311
 312     """
 313     from .fn_qualifiers import FN_QUALIFIERS
 314
 315     parser = etree.HTMLParser(encoding='utf-8')
 316     tree = etree.parse(html_path, parser)
 317     footnotes = tree.find('//*[@id="footnotes"]')
 318     re_qualifier = re.compile(ur'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
 319     if footnotes is not None:
 320         for footnote in footnotes.findall('div'):
 321             fn_type = footnote.get('class').split('-')[1]
 322             anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
 323             del footnote[:2]
 324             footnote.text = None
 325             if len(footnote) and footnote[-1].tail == '\n':
 326                 footnote[-1].tail = None
 327             text_str = etree.tostring(footnote, method='text', encoding=unicode).strip()
 328             html_str = etree.tostring(footnote, method='html', encoding=unicode).strip()
 329
 330             match = re_qualifier.match(text_str)
 331             if match:
 332                 qualifier_str = match.group(1)
 333                 qualifiers = []
 334                 for candidate in re.split('[;,]', qualifier_str):
 335                     candidate = candidate.strip()
 336                     if candidate in FN_QUALIFIERS:
 337                         qualifiers.append(candidate)
 338                     elif candidate.startswith('z '):
 339                         subcandidate = candidate.split()[1]
 340                         if subcandidate in FN_QUALIFIERS:
 341                             qualifiers.append(subcandidate)
 342             else:
 343                 qualifiers = []
 344
 345             yield anchor, fn_type, qualifiers, text_str, html_str