librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import re
   8 import cStringIO
   9 import copy
  10
  11 from lxml import etree
  12 from librarian import XHTMLNS, ParseError, OutputFile
  13 from librarian import functions
  14
  15 from lxml.etree import XMLSyntaxError, XSLTApplyError
  16
# Module-level side effect: both registration hooks from librarian.functions
# run once, when this module is imported.
# NOTE(review): presumably they register the extension functions that the
# XSLT stylesheets used in this module call -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_person_name()
  19
  20 STYLESHEETS = {
  21     'legacy': 'xslt/book2html.xslt',
  22     'full': 'xslt/wl2html_full.xslt',
  23     'partial': 'xslt/wl2html_partial.xslt'
  24 }
  25
  26
  27 def get_stylesheet(name):
  28     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  29
  30
  31 def html_has_content(text):
  32     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  33
  34
  35 def transform_abstrakt(abstrakt_element):
  36     from cStringIO import StringIO
  37     style_filename = get_stylesheet('legacy')
  38     style = etree.parse(style_filename)
  39     xml = etree.tostring(abstrakt_element)
  40     document = etree.parse(StringIO(xml.replace('abstrakt', 'dlugi_cytat')))  # HACK
  41     result = document.xslt(style)
  42     html = re.sub('<a name="sec[0-9]*"/>', '', etree.tostring(result))
  43     return re.sub('</?blockquote[^>]*>', '', html)
  44
  45
  46 def transform(wldoc, stylesheet='legacy', options=None, flags=None):
  47     """Transforms the WL document to XHTML.
  48
  49     If output_filename is None, returns an XML,
  50     otherwise returns True if file has been written,False if it hasn't.
  51     File won't be written if it has no content.
  52     """
  53     # Parse XSLT
  54     try:
  55         style_filename = get_stylesheet(stylesheet)
  56         style = etree.parse(style_filename)
  57
  58         document = copy.deepcopy(wldoc)
  59         del wldoc
  60         document.swap_endlines()
  61
  62         if flags:
  63             for flag in flags:
  64                 document.edoc.getroot().set(flag, 'yes')
  65
  66         document.clean_ed_note()
  67         document.clean_ed_note('abstrakt')
  68
  69         if not options:
  70             options = {}
  71         options.setdefault('gallery', "''")
  72         result = document.transform(style, **options)
  73         del document  # no longer needed large object :)
  74
  75         if html_has_content(result):
  76             add_anchors(result.getroot())
  77             add_table_of_themes(result.getroot())
  78             add_table_of_contents(result.getroot())
  79
  80             return OutputFile.from_string(etree.tostring(
  81                 result, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8'))
  82         else:
  83             return None
  84     except KeyError:
  85         raise ValueError("'%s' is not a valid stylesheet.")
  86     except (XMLSyntaxError, XSLTApplyError), e:
  87         raise ParseError(e)
  88
  89
  90 class Fragment(object):
  91     def __init__(self, id, themes):
  92         super(Fragment, self).__init__()
  93         self.id = id
  94         self.themes = themes
  95         self.events = []
  96
  97     def append(self, event, element):
  98         self.events.append((event, element))
  99
 100     def closed_events(self):
 101         stack = []
 102         for event, element in self.events:
 103             if event == 'start':
 104                 stack.append(('end', element))
 105             elif event == 'end':
 106                 try:
 107                     stack.pop()
 108                 except IndexError:
 109                     print 'CLOSED NON-OPEN TAG:', element
 110
 111         stack.reverse()
 112         return self.events + stack
 113
 114     def to_string(self):
 115         result = []
 116         for event, element in self.closed_events():
 117             if event == 'start':
 118                 result.append(u'<%s %s>' % (
 119                     element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 120                 if element.text:
 121                     result.append(element.text)
 122             elif event == 'end':
 123                 result.append(u'</%s>' % element.tag)
 124                 if element.tail:
 125                     result.append(element.tail)
 126             else:
 127                 result.append(element)
 128
 129         return ''.join(result)
 130
 131     def __unicode__(self):
 132         return self.to_string()
 133
 134
 135 def extract_fragments(input_filename):
 136     """Extracts theme fragments from input_filename."""
 137     open_fragments = {}
 138     closed_fragments = {}
 139
 140     # iterparse would die on a HTML document
 141     parser = etree.HTMLParser(encoding='utf-8')
 142     buf = cStringIO.StringIO()
 143     buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
 144     buf.seek(0)
 145
 146     for event, element in etree.iterparse(buf, events=('start', 'end')):
 147         # Process begin and end elements
 148         if element.get('class', '') in ('theme-begin', 'theme-end'):
 149             if not event == 'end':
 150                 continue  # Process elements only once, on end event
 151
 152             # Open new fragment
 153             if element.get('class', '') == 'theme-begin':
 154                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 155
 156                 # Append parents
 157                 parent = element.getparent()
 158                 parents = []
 159                 while parent.get('id', None) != 'book-text':
 160                     cparent = copy.deepcopy(parent)
 161                     cparent.text = None
 162                     parents.append(cparent)
 163                     parent = parent.getparent()
 164
 165                 parents.reverse()
 166                 for parent in parents:
 167                     fragment.append('start', parent)
 168
 169                 open_fragments[fragment.id] = fragment
 170
 171             # Close existing fragment
 172             else:
 173                 try:
 174                     fragment = open_fragments[element.get('fid')]
 175                 except KeyError:
 176                     print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
 177                 else:
 178                     closed_fragments[fragment.id] = fragment
 179                     del open_fragments[fragment.id]
 180
 181             # Append element tail to lost_text (we don't want to lose any text)
 182             if element.tail:
 183                 for fragment_id in open_fragments:
 184                     open_fragments[fragment_id].append('text', element.tail)
 185
 186         # Process all elements except begin and end
 187         else:
 188             # Omit annotation tags
 189             if (len(element.get('name', '')) or
 190                     element.get('class', '') in ('annotation', 'anchor')):
 191                 if event == 'end' and element.tail:
 192                     for fragment_id in open_fragments:
 193                         open_fragments[fragment_id].append('text', element.tail)
 194             else:
 195                 for fragment_id in open_fragments:
 196                     open_fragments[fragment_id].append(event, copy.copy(element))
 197
 198     return closed_fragments, open_fragments
 199
 200
 201 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 202     parent = element.getparent()
 203     index = parent.index(element)
 204
 205     if with_link:
 206         if link_text is None:
 207             link_text = prefix
 208         anchor = etree.Element('a', href='#%s' % prefix)
 209         anchor.set('class', 'anchor')
 210         anchor.text = unicode(link_text)
 211         parent.insert(index, anchor)
 212
 213     if with_target:
 214         anchor_target = etree.Element('a', name='%s' % prefix)
 215         anchor_target.set('class', 'target')
 216         anchor_target.text = u' '
 217         parent.insert(index, anchor_target)
 218
 219
 220 def any_ancestor(element, test):
 221     for ancestor in element.iterancestors():
 222         if test(ancestor):
 223             return True
 224     return False
 225
 226
 227 def add_anchors(root):
 228     counter = 1
 229     for element in root.iterdescendants():
 230         def f(e):
 231             return e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication', 'frame') or \
 232                 e.get('id') == 'nota_red' or e.tag == 'blockquote'
 233         if any_ancestor(element, f):
 234             continue
 235
 236         if element.tag == 'p' and 'verse' in element.get('class', ''):
 237             if counter == 1 or counter % 5 == 0:
 238                 add_anchor(element, "f%d" % counter, link_text=counter)
 239             counter += 1
 240         elif 'paragraph' in element.get('class', ''):
 241             add_anchor(element, "f%d" % counter, link_text=counter)
 242             counter += 1
 243
 244
 245 def raw_printable_text(element):
 246     working = copy.deepcopy(element)
 247     for e in working.findall('a'):
 248         if e.get('class') in ('annotation', 'theme-begin'):
 249             e.text = ''
 250     return etree.tostring(working, method='text', encoding=unicode).strip()
 251
 252
 253 def add_table_of_contents(root):
 254     sections = []
 255     counter = 1
 256     for element in root.iterdescendants():
 257         if element.tag in ('h2', 'h3'):
 258             if any_ancestor(element,
 259                             lambda e: e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)):
 260                 continue
 261
 262             element_text = raw_printable_text(element)
 263             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 264                 sections[-1][3].append((counter, element.tag, element_text, []))
 265             else:
 266                 sections.append((counter, element.tag, element_text, []))
 267             add_anchor(element, "s%d" % counter, with_link=False)
 268             counter += 1
 269
 270     toc = etree.Element('div')
 271     toc.set('id', 'toc')
 272     toc_header = etree.SubElement(toc, 'h2')
 273     toc_header.text = u'Spis treści'
 274     toc_list = etree.SubElement(toc, 'ol')
 275
 276     for n, section, text, subsections in sections:
 277         section_element = etree.SubElement(toc_list, 'li')
 278         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 279
 280         if len(subsections):
 281             subsection_list = etree.SubElement(section_element, 'ol')
 282             for n1, subsection, subtext, _ in subsections:
 283                 subsection_element = etree.SubElement(subsection_list, 'li')
 284                 add_anchor(subsection_element, "s%d" % n1, with_target=False, link_text=subtext)
 285
 286     root.insert(0, toc)
 287
 288
 289 def add_table_of_themes(root):
 290     try:
 291         from sortify import sortify
 292     except ImportError:
 293         def sortify(x):
 294             return x
 295
 296     book_themes = {}
 297     for fragment in root.findall('.//a[@class="theme-begin"]'):
 298         if not fragment.text:
 299             continue
 300         theme_names = [s.strip() for s in fragment.text.split(',')]
 301         for theme_name in theme_names:
 302             book_themes.setdefault(theme_name, []).append(fragment.get('name'))
 303     book_themes = book_themes.items()
 304     book_themes.sort(key=lambda s: sortify(s[0]))
 305     themes_div = etree.Element('div', id="themes")
 306     themes_ol = etree.SubElement(themes_div, 'ol')
 307     for theme_name, fragments in book_themes:
 308         themes_li = etree.SubElement(themes_ol, 'li')
 309         themes_li.text = "%s: " % theme_name
 310         for i, fragment in enumerate(fragments):
 311             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
 312             item.text = str(i + 1)
 313             item.tail = ' '
 314     root.insert(0, themes_div)
 315
 316
 317 def extract_annotations(html_path):
 318     """Extracts annotations from HTML for annotations dictionary.
 319
 320     For each annotation, yields a tuple of:
 321     anchor, footnote type, valid qualifiers, text, html.
 322
 323     """
 324     from .fn_qualifiers import FN_QUALIFIERS
 325
 326     parser = etree.HTMLParser(encoding='utf-8')
 327     tree = etree.parse(html_path, parser)
 328     footnotes = tree.find('//*[@id="footnotes"]')
 329     re_qualifier = re.compile(ur'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
 330     if footnotes is not None:
 331         for footnote in footnotes.findall('div'):
 332             fn_type = footnote.get('class').split('-')[1]
 333             anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
 334             del footnote[:2]
 335             footnote.text = None
 336             if len(footnote) and footnote[-1].tail == '\n':
 337                 footnote[-1].tail = None
 338             text_str = etree.tostring(footnote, method='text', encoding=unicode).strip()
 339             html_str = etree.tostring(footnote, method='html', encoding=unicode).strip()
 340
 341             match = re_qualifier.match(text_str)
 342             if match:
 343                 qualifier_str = match.group(1)
 344                 qualifiers = []
 345                 for candidate in re.split('[;,]', qualifier_str):
 346                     candidate = candidate.strip()
 347                     if candidate in FN_QUALIFIERS:
 348                         qualifiers.append(candidate)
 349                     elif candidate.startswith('z '):
 350                         subcandidate = candidate.split()[1]
 351                         if subcandidate in FN_QUALIFIERS:
 352                             qualifiers.append(subcandidate)
 353             else:
 354                 qualifiers = []
 355
 356             yield anchor, fn_type, qualifiers, text_str, html_str