librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 #    This file is part of Librarian.
   4 #
   5 #    Copyright © 2008,2009,2010 Fundacja Nowoczesna Polska <fundacja@nowoczesnapolska.org.pl>
   6 #
   7 #    For full list of contributors see AUTHORS file.
   8 #
   9 #    This program is free software: you can redistribute it and/or modify
  10 #    it under the terms of the GNU Affero General Public License as published by
  11 #    the Free Software Foundation, either version 3 of the License, or
  12 #    (at your option) any later version.
  13 #
  14 #    This program is distributed in the hope that it will be useful,
  15 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 #    GNU Affero General Public License for more details.
  18 #
  19 #    You should have received a copy of the GNU Affero General Public License
  20 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  21 #
import copy
import cStringIO
import os
import re
from xml.sax.saxutils import escape, quoteattr

from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError

from librarian import XHTMLNS, ParseError
from librarian.parser import WLDocument
  32
# (plain-text sequence, typographic replacement) pairs applied, in this order,
# by substitute_entities().  The order matters: u'---' has to be replaced
# before u'--', otherwise the shorter rule would consume part of the longer
# sequence.
ENTITY_SUBSTITUTIONS = [
    (u'---', u'—'),
    (u'--', u'–'),
    (u'...', u'…'),
    (u',,', u'„'),
    (u'"', u'”'),
]
  40
# Stylesheet name -> path of the XSLT file, relative to the directory that
# contains this module (get_stylesheet() joins the two).
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
  46
  47 def get_stylesheet(name):
  48     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  49
  50 def substitute_entities(context, text):
  51     """XPath extension function converting all entites in passed text."""
  52     if isinstance(text, list):
  53         text = ''.join(text)
  54     for entity, substitutution in ENTITY_SUBSTITUTIONS:
  55         text = text.replace(entity, substitutution)
  56     return text
  57
# Register substitute_entities with lxml as an XPath extension function named
# 'substitute_entities' in the http://wolnelektury.pl/functions namespace.
# This happens as a side effect of importing the module.
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['substitute_entities'] = substitute_entities
  61
  62 def transform(input, output_filename=None, is_file=True, \
  63     parse_dublincore=True, stylesheet='legacy', options={}):
  64     """Transforms file input_filename in XML to output_filename in XHTML."""
  65     # Parse XSLT
  66     try:
  67         style_filename = get_stylesheet(stylesheet)
  68         style = etree.parse(style_filename)
  69
  70         if is_file:
  71             document = WLDocument.from_file(input, True, \
  72                 parse_dublincore=parse_dublincore)
  73         else:
  74             document = WLDocument.from_string(input, True, \
  75                 parse_dublincore=parse_dublincore)
  76
  77         result = document.transform(style, **options)
  78         del document # no longer needed large object :)
  79
  80         if etree.ETXPath('//p|//{%s}p' % str(XHTMLNS))(result) is not None:
  81             add_anchors(result.getroot())
  82             add_table_of_contents(result.getroot())
  83
  84             if output_filename is not None:
  85                 result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8')
  86             else:
  87                 return result
  88             return True
  89         else:
  90             return "<empty />"
  91     except KeyError:
  92         raise ValueError("'%s' is not a valid stylesheet.")
  93     except (XMLSyntaxError, XSLTApplyError), e:
  94         raise ParseError(e)
  95
  96 class Fragment(object):
  97     def __init__(self, id, themes):
  98         super(Fragment, self).__init__()
  99         self.id = id
 100         self.themes = themes
 101         self.events = []
 102
 103     def append(self, event, element):
 104         self.events.append((event, element))
 105
 106     def closed_events(self):
 107         stack = []
 108         for event, element in self.events:
 109             if event == 'start':
 110                 stack.append(('end', element))
 111             elif event == 'end':
 112                 try:
 113                     stack.pop()
 114                 except IndexError:
 115                     print 'CLOSED NON-OPEN TAG:', element
 116
 117         stack.reverse()
 118         return self.events + stack
 119
 120     def to_string(self):
 121         result = []
 122         for event, element in self.closed_events():
 123             if event == 'start':
 124                 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 125                 if element.text:
 126                     result.append(element.text)
 127             elif event == 'end':
 128                 result.append(u'</%s>' % element.tag)
 129                 if element.tail:
 130                     result.append(element.tail)
 131             else:
 132                 result.append(element)
 133
 134         return ''.join(result)
 135
 136     def __unicode__(self):
 137         return self.to_string()
 138
 139
 140 def extract_fragments(input_filename):
 141     """Extracts theme fragments from input_filename."""
 142     open_fragments = {}
 143     closed_fragments = {}
 144
 145     for event, element in etree.iterparse(input_filename, events=('start', 'end')):
 146         # Process begin and end elements
 147         if element.get('class', '') in ('theme-begin', 'theme-end'):
 148             if not event == 'end': continue # Process elements only once, on end event
 149
 150             # Open new fragment
 151             if element.get('class', '') == 'theme-begin':
 152                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 153
 154                 # Append parents
 155                 if element.getparent().get('id', None) != 'book-text':
 156                     parents = [element.getparent()]
 157                     while parents[-1].getparent().get('id', None) != 'book-text':
 158                         parents.append(parents[-1].getparent())
 159
 160                     parents.reverse()
 161                     for parent in parents:
 162                         fragment.append('start', parent)
 163
 164                 open_fragments[fragment.id] = fragment
 165
 166             # Close existing fragment
 167             else:
 168                 try:
 169                     fragment = open_fragments[element.get('fid')]
 170                 except KeyError:
 171                     print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
 172                 else:
 173                     closed_fragments[fragment.id] = fragment
 174                     del open_fragments[fragment.id]
 175
 176             # Append element tail to lost_text (we don't want to lose any text)
 177             if element.tail:
 178                 for fragment_id in open_fragments:
 179                     open_fragments[fragment_id].append('text', element.tail)
 180
 181
 182         # Process all elements except begin and end
 183         else:
 184             # Omit annotation tags
 185             if len(element.get('name', '')) or element.get('class', '') == 'annotation':
 186                 if event == 'end' and element.tail:
 187                     for fragment_id in open_fragments:
 188                         open_fragments[fragment_id].append('text', element.tail)
 189             else:
 190                 for fragment_id in open_fragments:
 191                     open_fragments[fragment_id].append(event, copy.copy(element))
 192
 193     return closed_fragments, open_fragments
 194
 195
 196 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 197     if with_link:
 198         if link_text is None:
 199             link_text = prefix
 200         anchor = etree.Element('a', href='#%s' % prefix)
 201         anchor.set('class', 'anchor')
 202         anchor.text = unicode(link_text)
 203         if element.text:
 204             anchor.tail = element.text
 205             element.text = u''
 206         element.insert(0, anchor)
 207
 208     if with_target:
 209         anchor_target = etree.Element('a', name='%s' % prefix)
 210         anchor_target.set('class', 'target')
 211         anchor_target.text = u' '
 212         if element.text:
 213             anchor_target.tail = element.text
 214             element.text = u''
 215         element.insert(0, anchor_target)
 216
 217
 218 def any_ancestor(element, test):
 219     for ancestor in element.iterancestors():
 220         if test(ancestor):
 221             return True
 222     return False
 223
 224
 225 def add_anchors(root):
 226     counter = 1
 227     for element in root.iterdescendants():
 228         if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
 229         or e.tag == 'blockquote'):
 230             continue
 231
 232         if element.tag == 'p' and 'verse' in element.get('class', ''):
 233             if counter == 1 or counter % 5 == 0:
 234                 add_anchor(element, "f%d" % counter, link_text=counter)
 235             counter += 1
 236         elif 'paragraph' in element.get('class', ''):
 237             add_anchor(element, "f%d" % counter, link_text=counter)
 238             counter += 1
 239
 240
 241 def add_table_of_contents(root):
 242     sections = []
 243     counter = 1
 244     for element in root.iterdescendants():
 245         if element.tag in ('h2', 'h3'):
 246             if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
 247                 continue
 248
 249             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 250                 sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), []))
 251             else:
 252                 sections.append((counter, element.tag, ''.join(element.xpath('text()')), []))
 253             add_anchor(element, "s%d" % counter, with_link=False)
 254             counter += 1
 255
 256     toc = etree.Element('div')
 257     toc.set('id', 'toc')
 258     toc_header = etree.SubElement(toc, 'h2')
 259     toc_header.text = u'Spis treści'
 260     toc_list = etree.SubElement(toc, 'ol')
 261
 262     for n, section, text, subsections in sections:
 263         section_element = etree.SubElement(toc_list, 'li')
 264         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 265
 266         if len(subsections):
 267             subsection_list = etree.SubElement(section_element, 'ol')
 268             for n, subsection, text, _ in subsections:
 269                 subsection_element = etree.SubElement(subsection_list, 'li')
 270                 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
 271
 272     root.insert(0, toc)
 273