librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import cStringIO
   8 import re
   9 import copy
  10
  11 from lxml import etree
  12 from librarian.parser import WLDocument
  13 from librarian import XHTMLNS, ParseError
  14
  15 from lxml.etree import XMLSyntaxError, XSLTApplyError
  16
# Ordered (source text, replacement) pairs applied by substitute_entities().
# Each pair is applied in turn to the whole text, so the order matters:
# u'---' has to be replaced before u'--', otherwise three hyphens would be
# turned into an en dash followed by a stray hyphen.
ENTITY_SUBSTITUTIONS = [
    (u'---', u'—'),  # three hyphens -> em dash
    (u'--', u'–'),   # two hyphens -> en dash
    (u'...', u'…'),  # three dots -> ellipsis character
    (u',,', u'„'),   # two commas -> low opening quotation mark
    (u'"', u'”'),    # straight double quote -> closing quotation mark
]
  24
# Maps the stylesheet names accepted by get_stylesheet() (and therefore by
# the ``stylesheet`` argument of transform()) to XSLT file paths.  The paths
# are relative to the directory containing this module.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
  30
  31 def get_stylesheet(name):
  32     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  33
  34 def substitute_entities(context, text):
  35     """XPath extension function converting all entites in passed text."""
  36     if isinstance(text, list):
  37         text = ''.join(text)
  38     for entity, substitutution in ENTITY_SUBSTITUTIONS:
  39         text = text.replace(entity, substitutution)
  40     return text
  41
# Register substitute_entities with lxml as an extension function in the
# 'http://wolnelektury.pl/functions' namespace.  This runs at import time and
# makes the function callable under the name 'substitute_entities' from
# XPath expressions evaluated by lxml (presumably the XSLT stylesheets listed
# in STYLESHEETS -- verify against the .xslt files).
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['substitute_entities'] = substitute_entities
  45
  46 def html_has_content(text):
  47     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  48
  49 def transform(input, output_filename=None, is_file=True, \
  50     parse_dublincore=True, stylesheet='legacy', options={}):
  51     """Transforms file input_filename in XML to output_filename in XHTML.
  52
  53     If output_filename is None, returns an XML,
  54     otherwise returns True if file has been written,False if it hasn't.
  55     File won't be written if it has no content.
  56     """
  57     # Parse XSLT
  58     try:
  59         style_filename = get_stylesheet(stylesheet)
  60         style = etree.parse(style_filename)
  61
  62         if is_file:
  63             document = WLDocument.from_file(input, True, \
  64                 parse_dublincore=parse_dublincore)
  65         else:
  66             document = WLDocument.from_string(input, True, \
  67                 parse_dublincore=parse_dublincore)
  68
  69         result = document.transform(style, **options)
  70         del document # no longer needed large object :)
  71
  72         if html_has_content(result):
  73             add_anchors(result.getroot())
  74             add_table_of_contents(result.getroot())
  75
  76             if output_filename is not None:
  77                 result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8')
  78             else:
  79                 return result
  80             return True
  81         else:
  82             if output_filename is not None:
  83                 return False
  84             else:
  85                 return "<empty />"
  86     except KeyError:
  87         raise ValueError("'%s' is not a valid stylesheet.")
  88     except (XMLSyntaxError, XSLTApplyError), e:
  89         raise ParseError(e)
  90
  91 class Fragment(object):
  92     def __init__(self, id, themes):
  93         super(Fragment, self).__init__()
  94         self.id = id
  95         self.themes = themes
  96         self.events = []
  97
  98     def append(self, event, element):
  99         self.events.append((event, element))
 100
 101     def closed_events(self):
 102         stack = []
 103         for event, element in self.events:
 104             if event == 'start':
 105                 stack.append(('end', element))
 106             elif event == 'end':
 107                 try:
 108                     stack.pop()
 109                 except IndexError:
 110                     print 'CLOSED NON-OPEN TAG:', element
 111
 112         stack.reverse()
 113         return self.events + stack
 114
 115     def to_string(self):
 116         result = []
 117         for event, element in self.closed_events():
 118             if event == 'start':
 119                 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 120                 if element.text:
 121                     result.append(element.text)
 122             elif event == 'end':
 123                 result.append(u'</%s>' % element.tag)
 124                 if element.tail:
 125                     result.append(element.tail)
 126             else:
 127                 result.append(element)
 128
 129         return ''.join(result)
 130
 131     def __unicode__(self):
 132         return self.to_string()
 133
 134
def extract_fragments(input_filename):
    """Extract theme fragments from input_filename.

    The file is read with ``etree.iterparse`` using 'start' and 'end'
    events.  Elements whose ``class`` attribute is 'theme-begin' or
    'theme-end' are markers: a begin marker opens a Fragment identified by
    the marker's ``fid`` attribute, an end marker with the same ``fid``
    closes it.  While a fragment is open, the events of the elements that
    follow are recorded in it.

    Returns a ``(closed_fragments, open_fragments)`` tuple of dicts keyed by
    fragment id; ``open_fragments`` holds the fragments that were never
    closed.
    """
    open_fragments = {}
    closed_fragments = {}

    for event, element in etree.iterparse(input_filename, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end': continue # Process markers only once, on their end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                # The marker's text is stored as the fragment's themes
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Record 'start' events for the marker's ancestors, outermost
                # first, stopping below the element with id 'book-text'.
                # NOTE(review): this walk presumably relies on such an
                # ancestor always existing; if getparent() returned None the
                # .get() call would fail -- confirm against the input format.
                if element.getparent().get('id', None) != 'book-text':
                    parents = [element.getparent()]
                    while parents[-1].getparent().get('id', None) != 'book-text':
                        parents.append(parents[-1].getparent())

                    parents.reverse()
                    for parent in parents:
                        fragment.append('start', parent)

                open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    # End marker without a matching open fragment: report and go on
                    print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the marker's tail as a 'text' event to every fragment
            # that is still open (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)


        # Process all elements except begin and end
        else:
            # Omit elements with a non-empty 'name' attribute and annotation
            # tags; only their tail text is kept, once, on the end event
            if len(element.get('name', '')) or element.get('class', '') == 'annotation':
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append('text', element.tail)
            else:
                # Every open fragment gets its own copy.copy() of the element
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append(event, copy.copy(element))

    return closed_fragments, open_fragments
 189
 190
 191 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 192     if with_link:
 193         if link_text is None:
 194             link_text = prefix
 195         anchor = etree.Element('a', href='#%s' % prefix)
 196         anchor.set('class', 'anchor')
 197         anchor.text = unicode(link_text)
 198         if element.text:
 199             anchor.tail = element.text
 200             element.text = u''
 201         element.insert(0, anchor)
 202
 203     if with_target:
 204         anchor_target = etree.Element('a', name='%s' % prefix)
 205         anchor_target.set('class', 'target')
 206         anchor_target.text = u' '
 207         if element.text:
 208             anchor_target.tail = element.text
 209             element.text = u''
 210         element.insert(0, anchor_target)
 211
 212
 213 def any_ancestor(element, test):
 214     for ancestor in element.iterancestors():
 215         if test(ancestor):
 216             return True
 217     return False
 218
 219
 220 def add_anchors(root):
 221     counter = 1
 222     for element in root.iterdescendants():
 223         if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
 224         or e.tag == 'blockquote'):
 225             continue
 226
 227         if element.tag == 'p' and 'verse' in element.get('class', ''):
 228             if counter == 1 or counter % 5 == 0:
 229                 add_anchor(element, "f%d" % counter, link_text=counter)
 230             counter += 1
 231         elif 'paragraph' in element.get('class', ''):
 232             add_anchor(element, "f%d" % counter, link_text=counter)
 233             counter += 1
 234
 235
 236 def add_table_of_contents(root):
 237     sections = []
 238     counter = 1
 239     for element in root.iterdescendants():
 240         if element.tag in ('h2', 'h3'):
 241             if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
 242                 continue
 243
 244             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 245                 sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), []))
 246             else:
 247                 sections.append((counter, element.tag, ''.join(element.xpath('text()')), []))
 248             add_anchor(element, "s%d" % counter, with_link=False)
 249             counter += 1
 250
 251     toc = etree.Element('div')
 252     toc.set('id', 'toc')
 253     toc_header = etree.SubElement(toc, 'h2')
 254     toc_header.text = u'Spis treści'
 255     toc_list = etree.SubElement(toc, 'ol')
 256
 257     for n, section, text, subsections in sections:
 258         section_element = etree.SubElement(toc_list, 'li')
 259         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 260
 261         if len(subsections):
 262             subsection_list = etree.SubElement(section_element, 'ol')
 263             for n, subsection, text, _ in subsections:
 264                 subsection_element = etree.SubElement(subsection_list, 'li')
 265                 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
 266
 267     root.insert(0, toc)
 268