librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import cStringIO
   8 import re
   9 import copy
  10
  11 from lxml import etree
  12 from librarian.parser import WLDocument
  13 from librarian import XHTMLNS, ParseError
  14 from librarian import functions
  15
  16 from lxml.etree import XMLSyntaxError, XSLTApplyError
  17
# Import-time side effect: calls librarian.functions.reg_substitute_entities()
# once, when this module is first loaded.
# NOTE(review): judging by the name, this registers an entity-substitution
# helper that the XSLT stylesheets rely on -- confirm in librarian.functions.
functions.reg_substitute_entities()
  19
  20 STYLESHEETS = {
  21     'legacy': 'xslt/book2html.xslt',
  22     'full': 'xslt/wl2html_full.xslt',
  23     'partial': 'xslt/wl2html_partial.xslt'
  24 }
  25
  26 def get_stylesheet(name):
  27     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  28
  29 def html_has_content(text):
  30     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  31
  32 def transform(input, output_filename=None, is_file=True, \
  33     parse_dublincore=True, stylesheet='legacy', options={}):
  34     """Transforms file input_filename in XML to output_filename in XHTML.
  35
  36     If output_filename is None, returns an XML,
  37     otherwise returns True if file has been written,False if it hasn't.
  38     File won't be written if it has no content.
  39     """
  40     # Parse XSLT
  41     try:
  42         style_filename = get_stylesheet(stylesheet)
  43         style = etree.parse(style_filename)
  44
  45         if is_file:
  46             document = WLDocument.from_file(input, True, \
  47                 parse_dublincore=parse_dublincore)
  48         else:
  49             document = WLDocument.from_string(input, True, \
  50                 parse_dublincore=parse_dublincore)
  51
  52         document.clean_ed_note()
  53
  54         result = document.transform(style, **options)
  55         del document # no longer needed large object :)
  56
  57         if html_has_content(result):
  58             add_anchors(result.getroot())
  59             add_table_of_contents(result.getroot())
  60
  61             if output_filename is not None:
  62                 result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8')
  63             else:
  64                 return result
  65             return True
  66         else:
  67             if output_filename is not None:
  68                 return False
  69             else:
  70                 return "<empty />"
  71     except KeyError:
  72         raise ValueError("'%s' is not a valid stylesheet.")
  73     except (XMLSyntaxError, XSLTApplyError), e:
  74         raise ParseError(e)
  75
  76 class Fragment(object):
  77     def __init__(self, id, themes):
  78         super(Fragment, self).__init__()
  79         self.id = id
  80         self.themes = themes
  81         self.events = []
  82
  83     def append(self, event, element):
  84         self.events.append((event, element))
  85
  86     def closed_events(self):
  87         stack = []
  88         for event, element in self.events:
  89             if event == 'start':
  90                 stack.append(('end', element))
  91             elif event == 'end':
  92                 try:
  93                     stack.pop()
  94                 except IndexError:
  95                     print 'CLOSED NON-OPEN TAG:', element
  96
  97         stack.reverse()
  98         return self.events + stack
  99
 100     def to_string(self):
 101         result = []
 102         for event, element in self.closed_events():
 103             if event == 'start':
 104                 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 105                 if element.text:
 106                     result.append(element.text)
 107             elif event == 'end':
 108                 result.append(u'</%s>' % element.tag)
 109                 if element.tail:
 110                     result.append(element.tail)
 111             else:
 112                 result.append(element)
 113
 114         return ''.join(result)
 115
 116     def __unicode__(self):
 117         return self.to_string()
 118
 119
 120 def extract_fragments(input_filename):
 121     """Extracts theme fragments from input_filename."""
 122     open_fragments = {}
 123     closed_fragments = {}
 124
 125     for event, element in etree.iterparse(input_filename, events=('start', 'end')):
 126         # Process begin and end elements
 127         if element.get('class', '') in ('theme-begin', 'theme-end'):
 128             if not event == 'end': continue # Process elements only once, on end event
 129
 130             # Open new fragment
 131             if element.get('class', '') == 'theme-begin':
 132                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 133
 134                 # Append parents
 135                 if element.getparent().get('id', None) != 'book-text':
 136                     parents = [element.getparent()]
 137                     while parents[-1].getparent().get('id', None) != 'book-text':
 138                         parents.append(parents[-1].getparent())
 139
 140                     parents.reverse()
 141                     for parent in parents:
 142                         fragment.append('start', parent)
 143
 144                 open_fragments[fragment.id] = fragment
 145
 146             # Close existing fragment
 147             else:
 148                 try:
 149                     fragment = open_fragments[element.get('fid')]
 150                 except KeyError:
 151                     print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
 152                 else:
 153                     closed_fragments[fragment.id] = fragment
 154                     del open_fragments[fragment.id]
 155
 156             # Append element tail to lost_text (we don't want to lose any text)
 157             if element.tail:
 158                 for fragment_id in open_fragments:
 159                     open_fragments[fragment_id].append('text', element.tail)
 160
 161
 162         # Process all elements except begin and end
 163         else:
 164             # Omit annotation tags
 165             if len(element.get('name', '')) or element.get('class', '') == 'annotation':
 166                 if event == 'end' and element.tail:
 167                     for fragment_id in open_fragments:
 168                         open_fragments[fragment_id].append('text', element.tail)
 169             else:
 170                 for fragment_id in open_fragments:
 171                     open_fragments[fragment_id].append(event, copy.copy(element))
 172
 173     return closed_fragments, open_fragments
 174
 175
 176 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 177     if with_link:
 178         if link_text is None:
 179             link_text = prefix
 180         anchor = etree.Element('a', href='#%s' % prefix)
 181         anchor.set('class', 'anchor')
 182         anchor.text = unicode(link_text)
 183         if element.text:
 184             anchor.tail = element.text
 185             element.text = u''
 186         element.insert(0, anchor)
 187
 188     if with_target:
 189         anchor_target = etree.Element('a', name='%s' % prefix)
 190         anchor_target.set('class', 'target')
 191         anchor_target.text = u' '
 192         if element.text:
 193             anchor_target.tail = element.text
 194             element.text = u''
 195         element.insert(0, anchor_target)
 196
 197
 198 def any_ancestor(element, test):
 199     for ancestor in element.iterancestors():
 200         if test(ancestor):
 201             return True
 202     return False
 203
 204
 205 def add_anchors(root):
 206     counter = 1
 207     for element in root.iterdescendants():
 208         if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
 209         or e.get('id') == 'nota_red'
 210         or e.tag == 'blockquote'):
 211             continue
 212
 213         if element.tag == 'p' and 'verse' in element.get('class', ''):
 214             if counter == 1 or counter % 5 == 0:
 215                 add_anchor(element, "f%d" % counter, link_text=counter)
 216             counter += 1
 217         elif 'paragraph' in element.get('class', ''):
 218             add_anchor(element, "f%d" % counter, link_text=counter)
 219             counter += 1
 220
 221
 222 def add_table_of_contents(root):
 223     sections = []
 224     counter = 1
 225     for element in root.iterdescendants():
 226         if element.tag in ('h2', 'h3'):
 227             if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
 228                 continue
 229
 230             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 231                 sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), []))
 232             else:
 233                 sections.append((counter, element.tag, ''.join(element.xpath('text()')), []))
 234             add_anchor(element, "s%d" % counter, with_link=False)
 235             counter += 1
 236
 237     toc = etree.Element('div')
 238     toc.set('id', 'toc')
 239     toc_header = etree.SubElement(toc, 'h2')
 240     toc_header.text = u'Spis treści'
 241     toc_list = etree.SubElement(toc, 'ol')
 242
 243     for n, section, text, subsections in sections:
 244         section_element = etree.SubElement(toc_list, 'li')
 245         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 246
 247         if len(subsections):
 248             subsection_list = etree.SubElement(section_element, 'ol')
 249             for n, subsection, text, _ in subsections:
 250                 subsection_element = etree.SubElement(subsection_list, 'li')
 251                 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
 252
 253     root.insert(0, toc)
 254