librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import re
   8 import cStringIO
   9 import copy
  10
  11 from lxml import etree
  12 from librarian import XHTMLNS, ParseError, OutputFile
  13 from librarian import functions
  14
  15 from lxml.etree import XMLSyntaxError, XSLTApplyError
  16
# Registration hooks from librarian.functions, run once when this module is
# imported.
# NOTE(review): presumably these register the XSLT extension functions used by
# the stylesheets listed below -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()

# Maps a stylesheet name (as accepted by transform()) to its XSLT file.
# get_stylesheet() resolves these paths relative to this module's directory.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
  25
  26
  27 def get_stylesheet(name):
  28     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  29
  30
  31 def html_has_content(text):
  32     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  33
  34
  35 def transform(wldoc, stylesheet='legacy', options=None, flags=None):
  36     """Transforms the WL document to XHTML.
  37
  38     If output_filename is None, returns an XML,
  39     otherwise returns True if file has been written,False if it hasn't.
  40     File won't be written if it has no content.
  41     """
  42     # Parse XSLT
  43     try:
  44         style_filename = get_stylesheet(stylesheet)
  45         style = etree.parse(style_filename)
  46
  47         document = copy.deepcopy(wldoc)
  48         del wldoc
  49         document.swap_endlines()
  50
  51         if flags:
  52             for flag in flags:
  53                 document.edoc.getroot().set(flag, 'yes')
  54
  55         document.clean_ed_note()
  56         document.clean_ed_note('abstrakt')
  57
  58         if not options:
  59             options = {}
  60         result = document.transform(style, **options)
  61         del document  # no longer needed large object :)
  62
  63         if html_has_content(result):
  64             add_anchors(result.getroot())
  65             add_table_of_themes(result.getroot())
  66             add_table_of_contents(result.getroot())
  67
  68             return OutputFile.from_string(etree.tostring(
  69                 result, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8'))
  70         else:
  71             return None
  72     except KeyError:
  73         raise ValueError("'%s' is not a valid stylesheet.")
  74     except (XMLSyntaxError, XSLTApplyError), e:
  75         raise ParseError(e)
  76
  77
  78 class Fragment(object):
  79     def __init__(self, id, themes):
  80         super(Fragment, self).__init__()
  81         self.id = id
  82         self.themes = themes
  83         self.events = []
  84
  85     def append(self, event, element):
  86         self.events.append((event, element))
  87
  88     def closed_events(self):
  89         stack = []
  90         for event, element in self.events:
  91             if event == 'start':
  92                 stack.append(('end', element))
  93             elif event == 'end':
  94                 try:
  95                     stack.pop()
  96                 except IndexError:
  97                     print 'CLOSED NON-OPEN TAG:', element
  98
  99         stack.reverse()
 100         return self.events + stack
 101
 102     def to_string(self):
 103         result = []
 104         for event, element in self.closed_events():
 105             if event == 'start':
 106                 result.append(u'<%s %s>' % (
 107                     element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 108                 if element.text:
 109                     result.append(element.text)
 110             elif event == 'end':
 111                 result.append(u'</%s>' % element.tag)
 112                 if element.tail:
 113                     result.append(element.tail)
 114             else:
 115                 result.append(element)
 116
 117         return ''.join(result)
 118
 119     def __unicode__(self):
 120         return self.to_string()
 121
 122
def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename.

    Walks the document as a stream of start/end events. Elements with class
    'theme-begin' open a Fragment and elements with class 'theme-end' close
    it; the two are matched by their 'fid' attribute. Everything seen in
    between is recorded on every fragment that is open at that moment.

    Returns a (closed_fragments, open_fragments) tuple of dicts keyed by
    fragment id; open_fragments holds fragments that were never closed.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document, so the file is first parsed
    # with the HTML parser, one subtree is serialized again, and iterparse is
    # run over that serialization instead.
    # NOTE(review): getroot()[0][0] is presumably the first element inside
    # <body> -- confirm against the generated HTML.
    parser = etree.HTMLParser(encoding='utf-8')
    buf = cStringIO.StringIO()
    buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append parents: copies of the ancestors (text cleared) are
                # recorded as 'start' events, outermost first, stopping below
                # the element whose id is 'book-text'.
                # NOTE(review): if no ancestor has id 'book-text', parent
                # presumably ends up as None and parent.get() fails -- confirm
                # the input always contains that element.
                parent = element.getparent()
                parents = []
                while parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    parents.append(cparent)
                    parent = parent.getparent()

                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the marker's tail to every fragment that is open now (we
            # don't want to lose any text). A fragment opened just above
            # receives it; one closed just above no longer does.
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags: elements with a non-empty 'name' attribute
            # or with class 'annotation'/'anchor' are skipped, but their tail
            # text is still recorded.
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append('text', element.tail)
            else:
                # Each open fragment gets its own copy of the element.
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append(event, copy.copy(element))

    return closed_fragments, open_fragments
 187
 188
 189 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 190     parent = element.getparent()
 191     index = parent.index(element)
 192
 193     if with_link:
 194         if link_text is None:
 195             link_text = prefix
 196         anchor = etree.Element('a', href='#%s' % prefix)
 197         anchor.set('class', 'anchor')
 198         anchor.text = unicode(link_text)
 199         parent.insert(index, anchor)
 200
 201     if with_target:
 202         anchor_target = etree.Element('a', name='%s' % prefix)
 203         anchor_target.set('class', 'target')
 204         anchor_target.text = u' '
 205         parent.insert(index, anchor_target)
 206
 207
 208 def any_ancestor(element, test):
 209     for ancestor in element.iterancestors():
 210         if test(ancestor):
 211             return True
 212     return False
 213
 214
 215 def add_anchors(root):
 216     counter = 1
 217     for element in root.iterdescendants():
 218         def f(e):
 219             return e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication') or \
 220                 e.get('id') == 'nota_red' or e.tag == 'blockquote'
 221         if any_ancestor(element, f):
 222             continue
 223
 224         if element.tag == 'p' and 'verse' in element.get('class', ''):
 225             if counter == 1 or counter % 5 == 0:
 226                 add_anchor(element, "f%d" % counter, link_text=counter)
 227             counter += 1
 228         elif 'paragraph' in element.get('class', ''):
 229             add_anchor(element, "f%d" % counter, link_text=counter)
 230             counter += 1
 231
 232
 233 def raw_printable_text(element):
 234     working = copy.deepcopy(element)
 235     for e in working.findall('a'):
 236         if e.get('class') in ('annotation', 'theme-begin'):
 237             e.text = ''
 238     return etree.tostring(working, method='text', encoding=unicode).strip()
 239
 240
 241 def add_table_of_contents(root):
 242     sections = []
 243     counter = 1
 244     for element in root.iterdescendants():
 245         if element.tag in ('h2', 'h3'):
 246             if any_ancestor(element,
 247                             lambda e: e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)):
 248                 continue
 249
 250             element_text = raw_printable_text(element)
 251             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 252                 sections[-1][3].append((counter, element.tag, element_text, []))
 253             else:
 254                 sections.append((counter, element.tag, element_text, []))
 255             add_anchor(element, "s%d" % counter, with_link=False)
 256             counter += 1
 257
 258     toc = etree.Element('div')
 259     toc.set('id', 'toc')
 260     toc_header = etree.SubElement(toc, 'h2')
 261     toc_header.text = u'Spis treści'
 262     toc_list = etree.SubElement(toc, 'ol')
 263
 264     for n, section, text, subsections in sections:
 265         section_element = etree.SubElement(toc_list, 'li')
 266         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 267
 268         if len(subsections):
 269             subsection_list = etree.SubElement(section_element, 'ol')
 270             for n1, subsection, subtext, _ in subsections:
 271                 subsection_element = etree.SubElement(subsection_list, 'li')
 272                 add_anchor(subsection_element, "s%d" % n1, with_target=False, link_text=subtext)
 273
 274     root.insert(0, toc)
 275
 276
 277 def add_table_of_themes(root):
 278     try:
 279         from sortify import sortify
 280     except ImportError:
 281         def sortify(x):
 282             return x
 283
 284     book_themes = {}
 285     for fragment in root.findall('.//a[@class="theme-begin"]'):
 286         if not fragment.text:
 287             continue
 288         theme_names = [s.strip() for s in fragment.text.split(',')]
 289         for theme_name in theme_names:
 290             book_themes.setdefault(theme_name, []).append(fragment.get('name'))
 291     book_themes = book_themes.items()
 292     book_themes.sort(key=lambda s: sortify(s[0]))
 293     themes_div = etree.Element('div', id="themes")
 294     themes_ol = etree.SubElement(themes_div, 'ol')
 295     for theme_name, fragments in book_themes:
 296         themes_li = etree.SubElement(themes_ol, 'li')
 297         themes_li.text = "%s: " % theme_name
 298         for i, fragment in enumerate(fragments):
 299             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
 300             item.text = str(i + 1)
 301             item.tail = ' '
 302     root.insert(0, themes_div)
 303
 304
 305 def extract_annotations(html_path):
 306     """Extracts annotations from HTML for annotations dictionary.
 307
 308     For each annotation, yields a tuple of:
 309     anchor, footnote type, valid qualifiers, text, html.
 310
 311     """
 312     from .fn_qualifiers import FN_QUALIFIERS
 313
 314     parser = etree.HTMLParser(encoding='utf-8')
 315     tree = etree.parse(html_path, parser)
 316     footnotes = tree.find('//*[@id="footnotes"]')
 317     re_qualifier = re.compile(ur'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
 318     if footnotes is not None:
 319         for footnote in footnotes.findall('div'):
 320             fn_type = footnote.get('class').split('-')[1]
 321             anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
 322             del footnote[:2]
 323             footnote.text = None
 324             if len(footnote) and footnote[-1].tail == '\n':
 325                 footnote[-1].tail = None
 326             text_str = etree.tostring(footnote, method='text', encoding=unicode).strip()
 327             html_str = etree.tostring(footnote, method='html', encoding=unicode).strip()
 328
 329             match = re_qualifier.match(text_str)
 330             if match:
 331                 qualifier_str = match.group(1)
 332                 qualifiers = []
 333                 for candidate in re.split('[;,]', qualifier_str):
 334                     candidate = candidate.strip()
 335                     if candidate in FN_QUALIFIERS:
 336                         qualifiers.append(candidate)
 337                     elif candidate.startswith('z '):
 338                         subcandidate = candidate.split()[1]
 339                         if subcandidate in FN_QUALIFIERS:
 340                             qualifiers.append(subcandidate)
 341             else:
 342                 qualifiers = []
 343
 344             yield anchor, fn_type, qualifiers, text_str, html_str