librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import cStringIO
   8 import re
   9 import copy
  10
  11 from lxml import etree
  12 from librarian.parser import WLDocument
  13 from librarian import XHTMLNS, ParseError
  14
  15 from lxml.etree import XMLSyntaxError, XSLTApplyError
  16
# Pairs of (plain-text sequence, typographic replacement character).
# substitute_entities() applies the pairs in list order, so u'---' has to be
# listed before u'--': otherwise the shorter sequence would consume part of
# the longer one and the first replacement could never match.
ENTITY_SUBSTITUTIONS = [
    (u'---', u'—'),
    (u'--', u'–'),
    (u'...', u'…'),
    (u',,', u'„'),
    (u'"', u'”'),
]
  24
# Stylesheet names accepted by get_stylesheet(), mapped to the XSLT file
# paths that get_stylesheet() joins onto this module's directory.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
  30
  31 def get_stylesheet(name):
  32     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  33
  34 def substitute_entities(context, text):
  35     """XPath extension function converting all entites in passed text."""
  36     if isinstance(text, list):
  37         text = ''.join(text)
  38     for entity, substitutution in ENTITY_SUBSTITUTIONS:
  39         text = text.replace(entity, substitutution)
  40     return text
  41
# Register substitute_entities with lxml under the
# http://wolnelektury.pl/functions namespace. This runs at import time.
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['substitute_entities'] = substitute_entities
  45
  46 def transform(input, output_filename=None, is_file=True, \
  47     parse_dublincore=True, stylesheet='legacy', options={}):
  48     """Transforms file input_filename in XML to output_filename in XHTML.
  49
  50     If output_filename is None, returns an XML,
  51     otherwise returns True if file has been written,False if it hasn't.
  52     File won't be written if it has no content.
  53     """
  54     # Parse XSLT
  55     try:
  56         style_filename = get_stylesheet(stylesheet)
  57         style = etree.parse(style_filename)
  58
  59         if is_file:
  60             document = WLDocument.from_file(input, True, \
  61                 parse_dublincore=parse_dublincore)
  62         else:
  63             document = WLDocument.from_string(input, True, \
  64                 parse_dublincore=parse_dublincore)
  65
  66         result = document.transform(style, **options)
  67         del document # no longer needed large object :)
  68
  69         if etree.ETXPath('//p|//{%s}p' % str(XHTMLNS))(result):
  70             add_anchors(result.getroot())
  71             add_table_of_contents(result.getroot())
  72
  73             if output_filename is not None:
  74                 result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8')
  75             else:
  76                 return result
  77             return True
  78         else:
  79             if output_filename is not None:
  80                 return False
  81             else:
  82                 return "<empty />"
  83     except KeyError:
  84         raise ValueError("'%s' is not a valid stylesheet.")
  85     except (XMLSyntaxError, XSLTApplyError), e:
  86         raise ParseError(e)
  87
  88 class Fragment(object):
  89     def __init__(self, id, themes):
  90         super(Fragment, self).__init__()
  91         self.id = id
  92         self.themes = themes
  93         self.events = []
  94
  95     def append(self, event, element):
  96         self.events.append((event, element))
  97
  98     def closed_events(self):
  99         stack = []
 100         for event, element in self.events:
 101             if event == 'start':
 102                 stack.append(('end', element))
 103             elif event == 'end':
 104                 try:
 105                     stack.pop()
 106                 except IndexError:
 107                     print 'CLOSED NON-OPEN TAG:', element
 108
 109         stack.reverse()
 110         return self.events + stack
 111
 112     def to_string(self):
 113         result = []
 114         for event, element in self.closed_events():
 115             if event == 'start':
 116                 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 117                 if element.text:
 118                     result.append(element.text)
 119             elif event == 'end':
 120                 result.append(u'</%s>' % element.tag)
 121                 if element.tail:
 122                     result.append(element.tail)
 123             else:
 124                 result.append(element)
 125
 126         return ''.join(result)
 127
 128     def __unicode__(self):
 129         return self.to_string()
 130
 131
 132 def extract_fragments(input_filename):
 133     """Extracts theme fragments from input_filename."""
 134     open_fragments = {}
 135     closed_fragments = {}
 136
 137     for event, element in etree.iterparse(input_filename, events=('start', 'end')):
 138         # Process begin and end elements
 139         if element.get('class', '') in ('theme-begin', 'theme-end'):
 140             if not event == 'end': continue # Process elements only once, on end event
 141
 142             # Open new fragment
 143             if element.get('class', '') == 'theme-begin':
 144                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 145
 146                 # Append parents
 147                 if element.getparent().get('id', None) != 'book-text':
 148                     parents = [element.getparent()]
 149                     while parents[-1].getparent().get('id', None) != 'book-text':
 150                         parents.append(parents[-1].getparent())
 151
 152                     parents.reverse()
 153                     for parent in parents:
 154                         fragment.append('start', parent)
 155
 156                 open_fragments[fragment.id] = fragment
 157
 158             # Close existing fragment
 159             else:
 160                 try:
 161                     fragment = open_fragments[element.get('fid')]
 162                 except KeyError:
 163                     print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
 164                 else:
 165                     closed_fragments[fragment.id] = fragment
 166                     del open_fragments[fragment.id]
 167
 168             # Append element tail to lost_text (we don't want to lose any text)
 169             if element.tail:
 170                 for fragment_id in open_fragments:
 171                     open_fragments[fragment_id].append('text', element.tail)
 172
 173
 174         # Process all elements except begin and end
 175         else:
 176             # Omit annotation tags
 177             if len(element.get('name', '')) or element.get('class', '') == 'annotation':
 178                 if event == 'end' and element.tail:
 179                     for fragment_id in open_fragments:
 180                         open_fragments[fragment_id].append('text', element.tail)
 181             else:
 182                 for fragment_id in open_fragments:
 183                     open_fragments[fragment_id].append(event, copy.copy(element))
 184
 185     return closed_fragments, open_fragments
 186
 187
 188 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 189     if with_link:
 190         if link_text is None:
 191             link_text = prefix
 192         anchor = etree.Element('a', href='#%s' % prefix)
 193         anchor.set('class', 'anchor')
 194         anchor.text = unicode(link_text)
 195         if element.text:
 196             anchor.tail = element.text
 197             element.text = u''
 198         element.insert(0, anchor)
 199
 200     if with_target:
 201         anchor_target = etree.Element('a', name='%s' % prefix)
 202         anchor_target.set('class', 'target')
 203         anchor_target.text = u' '
 204         if element.text:
 205             anchor_target.tail = element.text
 206             element.text = u''
 207         element.insert(0, anchor_target)
 208
 209
 210 def any_ancestor(element, test):
 211     for ancestor in element.iterancestors():
 212         if test(ancestor):
 213             return True
 214     return False
 215
 216
 217 def add_anchors(root):
 218     counter = 1
 219     for element in root.iterdescendants():
 220         if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
 221         or e.tag == 'blockquote'):
 222             continue
 223
 224         if element.tag == 'p' and 'verse' in element.get('class', ''):
 225             if counter == 1 or counter % 5 == 0:
 226                 add_anchor(element, "f%d" % counter, link_text=counter)
 227             counter += 1
 228         elif 'paragraph' in element.get('class', ''):
 229             add_anchor(element, "f%d" % counter, link_text=counter)
 230             counter += 1
 231
 232
 233 def add_table_of_contents(root):
 234     sections = []
 235     counter = 1
 236     for element in root.iterdescendants():
 237         if element.tag in ('h2', 'h3'):
 238             if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
 239                 continue
 240
 241             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 242                 sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), []))
 243             else:
 244                 sections.append((counter, element.tag, ''.join(element.xpath('text()')), []))
 245             add_anchor(element, "s%d" % counter, with_link=False)
 246             counter += 1
 247
 248     toc = etree.Element('div')
 249     toc.set('id', 'toc')
 250     toc_header = etree.SubElement(toc, 'h2')
 251     toc_header.text = u'Spis treści'
 252     toc_list = etree.SubElement(toc, 'ol')
 253
 254     for n, section, text, subsections in sections:
 255         section_element = etree.SubElement(toc_list, 'li')
 256         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 257
 258         if len(subsections):
 259             subsection_list = etree.SubElement(section_element, 'ol')
 260             for n, subsection, text, _ in subsections:
 261                 subsection_element = etree.SubElement(subsection_list, 'li')
 262                 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
 263
 264     root.insert(0, toc)
 265