librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import re
   8 import cStringIO
   9 import copy
  10
  11 from lxml import etree
  12 from librarian import XHTMLNS, ParseError, OutputFile
  13 from librarian import functions
  14
  15 from lxml.etree import XMLSyntaxError, XSLTApplyError
  16
# Run the registration hooks from librarian.functions once, at import time.
# NOTE(review): presumably these expose helper functions to the XSLT
# stylesheets used below -- confirm against librarian/functions.py.
functions.reg_substitute_entities()
functions.reg_person_name()
  19
  20 STYLESHEETS = {
  21     'legacy': 'xslt/book2html.xslt',
  22     'full': 'xslt/wl2html_full.xslt',
  23     'partial': 'xslt/wl2html_partial.xslt'
  24 }
  25
  26 def get_stylesheet(name):
  27     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  28
  29 def html_has_content(text):
  30     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  31
  32 def transform(wldoc, stylesheet='legacy', options=None, flags=None):
  33     """Transforms the WL document to XHTML.
  34
  35     If output_filename is None, returns an XML,
  36     otherwise returns True if file has been written,False if it hasn't.
  37     File won't be written if it has no content.
  38     """
  39     # Parse XSLT
  40     try:
  41         style_filename = get_stylesheet(stylesheet)
  42         style = etree.parse(style_filename)
  43
  44         document = copy.deepcopy(wldoc)
  45         del wldoc
  46         document.swap_endlines()
  47
  48         if flags:
  49             for flag in flags:
  50                 document.edoc.getroot().set(flag, 'yes')
  51
  52         document.clean_ed_note()
  53
  54         if not options:
  55             options = {}
  56         result = document.transform(style, **options)
  57         del document # no longer needed large object :)
  58
  59         if html_has_content(result):
  60             add_anchors(result.getroot())
  61             add_table_of_themes(result.getroot())
  62             add_table_of_contents(result.getroot())
  63
  64             return OutputFile.from_string(etree.tostring(result, method='html',
  65                 xml_declaration=False, pretty_print=True, encoding='utf-8'))
  66         else:
  67             return None
  68     except KeyError:
  69         raise ValueError("'%s' is not a valid stylesheet.")
  70     except (XMLSyntaxError, XSLTApplyError), e:
  71         raise ParseError(e)
  72
  73 class Fragment(object):
  74     def __init__(self, id, themes):
  75         super(Fragment, self).__init__()
  76         self.id = id
  77         self.themes = themes
  78         self.events = []
  79
  80     def append(self, event, element):
  81         self.events.append((event, element))
  82
  83     def closed_events(self):
  84         stack = []
  85         for event, element in self.events:
  86             if event == 'start':
  87                 stack.append(('end', element))
  88             elif event == 'end':
  89                 try:
  90                     stack.pop()
  91                 except IndexError:
  92                     print 'CLOSED NON-OPEN TAG:', element
  93
  94         stack.reverse()
  95         return self.events + stack
  96
  97     def to_string(self):
  98         result = []
  99         for event, element in self.closed_events():
 100             if event == 'start':
 101                 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 102                 if element.text:
 103                     result.append(element.text)
 104             elif event == 'end':
 105                 result.append(u'</%s>' % element.tag)
 106                 if element.tail:
 107                     result.append(element.tail)
 108             else:
 109                 result.append(element)
 110
 111         return ''.join(result)
 112
 113     def __unicode__(self):
 114         return self.to_string()
 115
 116
 117 def extract_fragments(input_filename):
 118     """Extracts theme fragments from input_filename."""
 119     open_fragments = {}
 120     closed_fragments = {}
 121
 122     # iterparse would die on a HTML document
 123     parser = etree.HTMLParser(encoding='utf-8')
 124     buf = cStringIO.StringIO()
 125     buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
 126     buf.seek(0)
 127
 128     for event, element in etree.iterparse(buf, events=('start', 'end')):
 129         # Process begin and end elements
 130         if element.get('class', '') in ('theme-begin', 'theme-end'):
 131             if not event == 'end': continue # Process elements only once, on end event
 132
 133             # Open new fragment
 134             if element.get('class', '') == 'theme-begin':
 135                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 136
 137                 # Append parents
 138                 parent = element.getparent()
 139                 parents = []
 140                 while parent.get('id', None) != 'book-text':
 141                     cparent = copy.deepcopy(parent)
 142                     cparent.text = None
 143                     parents.append(cparent)
 144                     parent = parent.getparent()
 145
 146                 parents.reverse()
 147                 for parent in parents:
 148                     fragment.append('start', parent)
 149
 150                 open_fragments[fragment.id] = fragment
 151
 152             # Close existing fragment
 153             else:
 154                 try:
 155                     fragment = open_fragments[element.get('fid')]
 156                 except KeyError:
 157                     print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
 158                 else:
 159                     closed_fragments[fragment.id] = fragment
 160                     del open_fragments[fragment.id]
 161
 162             # Append element tail to lost_text (we don't want to lose any text)
 163             if element.tail:
 164                 for fragment_id in open_fragments:
 165                     open_fragments[fragment_id].append('text', element.tail)
 166
 167
 168         # Process all elements except begin and end
 169         else:
 170             # Omit annotation tags
 171             if (len(element.get('name', '')) or
 172                     element.get('class', '') in ('annotation', 'anchor')):
 173                 if event == 'end' and element.tail:
 174                     for fragment_id in open_fragments:
 175                         open_fragments[fragment_id].append('text', element.tail)
 176             else:
 177                 for fragment_id in open_fragments:
 178                     open_fragments[fragment_id].append(event, copy.copy(element))
 179
 180     return closed_fragments, open_fragments
 181
 182
 183 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 184     parent = element.getparent()
 185     index = parent.index(element)
 186
 187     if with_link:
 188         if link_text is None:
 189             link_text = prefix
 190         anchor = etree.Element('a', href='#%s' % prefix)
 191         anchor.set('class', 'anchor')
 192         anchor.text = unicode(link_text)
 193         parent.insert(index, anchor)
 194
 195     if with_target:
 196         anchor_target = etree.Element('a', name='%s' % prefix)
 197         anchor_target.set('class', 'target')
 198         anchor_target.text = u' '
 199         parent.insert(index, anchor_target)
 200
 201
 202 def any_ancestor(element, test):
 203     for ancestor in element.iterancestors():
 204         if test(ancestor):
 205             return True
 206     return False
 207
 208
 209 def add_anchors(root):
 210     counter = 1
 211     for element in root.iterdescendants():
 212         if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
 213         or e.get('id') == 'nota_red'
 214         or e.tag == 'blockquote'):
 215             continue
 216
 217         if element.tag == 'p' and 'verse' in element.get('class', ''):
 218             if counter == 1 or counter % 5 == 0:
 219                 add_anchor(element, "f%d" % counter, link_text=counter)
 220             counter += 1
 221         elif 'paragraph' in element.get('class', ''):
 222             add_anchor(element, "f%d" % counter, link_text=counter)
 223             counter += 1
 224
 225
 226 def raw_printable_text(element):
 227     working = copy.deepcopy(element)
 228     for e in working.findall('a'):
 229         if e.get('class') in ('annotation', 'theme-begin'):
 230             e.text = ''
 231     return etree.tostring(working, method='text', encoding=unicode).strip()
 232
 233
 234 def add_table_of_contents(root):
 235     sections = []
 236     counter = 1
 237     for element in root.iterdescendants():
 238         if element.tag in ('h2', 'h3'):
 239             if any_ancestor(element, lambda e: e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)):
 240                 continue
 241
 242             element_text = raw_printable_text(element)
 243             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 244                 sections[-1][3].append((counter, element.tag, element_text, []))
 245             else:
 246                 sections.append((counter, element.tag, element_text, []))
 247             add_anchor(element, "s%d" % counter, with_link=False)
 248             counter += 1
 249
 250     toc = etree.Element('div')
 251     toc.set('id', 'toc')
 252     toc_header = etree.SubElement(toc, 'h2')
 253     toc_header.text = u'Spis treści'
 254     toc_list = etree.SubElement(toc, 'ol')
 255
 256     for n, section, text, subsections in sections:
 257         section_element = etree.SubElement(toc_list, 'li')
 258         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 259
 260         if len(subsections):
 261             subsection_list = etree.SubElement(section_element, 'ol')
 262             for n, subsection, text, _ in subsections:
 263                 subsection_element = etree.SubElement(subsection_list, 'li')
 264                 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
 265
 266     root.insert(0, toc)
 267
 268
 269 def add_table_of_themes(root):
 270     try:
 271         from sortify import sortify
 272     except ImportError:
 273         sortify = lambda x: x
 274
 275     book_themes = {}
 276     for fragment in root.findall('.//a[@class="theme-begin"]'):
 277         if not fragment.text:
 278             continue
 279         theme_names = [s.strip() for s in fragment.text.split(',')]
 280         for theme_name in theme_names:
 281             book_themes.setdefault(theme_name, []).append(fragment.get('name'))
 282     book_themes = book_themes.items()
 283     book_themes.sort(key=lambda s: sortify(s[0]))
 284     themes_div = etree.Element('div', id="themes")
 285     themes_ol = etree.SubElement(themes_div, 'ol')
 286     for theme_name, fragments in book_themes:
 287         themes_li = etree.SubElement(themes_ol, 'li')
 288         themes_li.text = "%s: " % theme_name
 289         for i, fragment in enumerate(fragments):
 290             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
 291             item.text = str(i + 1)
 292             item.tail = ' '
 293     root.insert(0, themes_div)
 294
 295
 296 def extract_annotations(html_path):
 297     """For each annotation, yields a tuple: anchor, text, html."""
 298     parser = etree.HTMLParser(encoding='utf-8')
 299     tree = etree.parse(html_path, parser)
 300     footnotes = tree.find('//*[@id="footnotes"]')
 301     re_qualifier = re.compile(ur'[^\u2014]+\s+\((.+)\)\s+\u2014')
 302     if footnotes is not None:
 303         for footnote in footnotes.findall('div'):
 304             fn_type = footnote.get('class').split('-')[1]
 305             anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
 306             del footnote[:2]
 307             footnote.text = None
 308             if len(footnote) and footnote[-1].tail == '\n':
 309                 footnote[-1].tail = None
 310             text_str = etree.tostring(footnote, method='text', encoding=unicode).strip()
 311             html_str = etree.tostring(footnote, method='html', encoding=unicode).strip()
 312             qualifier = None
 313             match = re_qualifier.match(text_str)
 314             if match:
 315                 qualifier = match.group(1)
 316
 317             yield anchor, fn_type, qualifier, text_str, html_str
 318