librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import cStringIO
   8 import re
   9 import copy
  10
  11 from lxml import etree
  12 from librarian.parser import WLDocument
  13 from librarian import XHTMLNS, ParseError
  14 from librarian import functions
  15
  16 from lxml.etree import XMLSyntaxError, XSLTApplyError
  17
# Import-time setup call into librarian.functions.
# NOTE(review): presumably registers the entity-substitution XPath helper
# used by the stylesheets -- confirm against librarian/functions.py.
functions.reg_substitute_entities()

# Maps the stylesheet names accepted by transform() to XSLT file paths.
# The paths are relative to the directory containing this module; they are
# resolved to full paths by get_stylesheet().
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
  25
  26 def get_stylesheet(name):
  27     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  28
  29 def html_has_content(text):
  30     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  31
  32 def transform(input, output_filename=None, is_file=True, \
  33     parse_dublincore=True, stylesheet='legacy', options={}, flags=None):
  34     """Transforms file input_filename in XML to output_filename in XHTML.
  35
  36     If output_filename is None, returns an XML,
  37     otherwise returns True if file has been written,False if it hasn't.
  38     File won't be written if it has no content.
  39     """
  40     # Parse XSLT
  41     try:
  42         style_filename = get_stylesheet(stylesheet)
  43         style = etree.parse(style_filename)
  44
  45         if is_file:
  46             document = WLDocument.from_file(input, True, \
  47                 parse_dublincore=parse_dublincore)
  48         else:
  49             document = WLDocument.from_string(input, True, \
  50                 parse_dublincore=parse_dublincore)
  51
  52         if flags:
  53             for flag in flags:
  54                 document.edoc.getroot().set(flag, 'yes')
  55
  56         document.clean_ed_note()
  57
  58         result = document.transform(style, **options)
  59         del document # no longer needed large object :)
  60
  61         if html_has_content(result):
  62             add_anchors(result.getroot())
  63             add_table_of_contents(result.getroot())
  64
  65             if output_filename is not None:
  66                 result.write(output_filename, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8')
  67             else:
  68                 return result
  69             return True
  70         else:
  71             if output_filename is not None:
  72                 return False
  73             else:
  74                 return "<empty />"
  75     except KeyError:
  76         raise ValueError("'%s' is not a valid stylesheet.")
  77     except (XMLSyntaxError, XSLTApplyError), e:
  78         raise ParseError(e)
  79
  80 class Fragment(object):
  81     def __init__(self, id, themes):
  82         super(Fragment, self).__init__()
  83         self.id = id
  84         self.themes = themes
  85         self.events = []
  86
  87     def append(self, event, element):
  88         self.events.append((event, element))
  89
  90     def closed_events(self):
  91         stack = []
  92         for event, element in self.events:
  93             if event == 'start':
  94                 stack.append(('end', element))
  95             elif event == 'end':
  96                 try:
  97                     stack.pop()
  98                 except IndexError:
  99                     print 'CLOSED NON-OPEN TAG:', element
 100
 101         stack.reverse()
 102         return self.events + stack
 103
 104     def to_string(self):
 105         result = []
 106         for event, element in self.closed_events():
 107             if event == 'start':
 108                 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 109                 if element.text:
 110                     result.append(element.text)
 111             elif event == 'end':
 112                 result.append(u'</%s>' % element.tag)
 113                 if element.tail:
 114                     result.append(element.tail)
 115             else:
 116                 result.append(element)
 117
 118         return ''.join(result)
 119
 120     def __unicode__(self):
 121         return self.to_string()
 122
 123
 124 def extract_fragments(input_filename):
 125     """Extracts theme fragments from input_filename."""
 126     open_fragments = {}
 127     closed_fragments = {}
 128
 129     for event, element in etree.iterparse(input_filename, events=('start', 'end')):
 130         # Process begin and end elements
 131         if element.get('class', '') in ('theme-begin', 'theme-end'):
 132             if not event == 'end': continue # Process elements only once, on end event
 133
 134             # Open new fragment
 135             if element.get('class', '') == 'theme-begin':
 136                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 137
 138                 # Append parents
 139                 if element.getparent().get('id', None) != 'book-text':
 140                     parents = [element.getparent()]
 141                     while parents[-1].getparent().get('id', None) != 'book-text':
 142                         parents.append(parents[-1].getparent())
 143
 144                     parents.reverse()
 145                     for parent in parents:
 146                         fragment.append('start', parent)
 147
 148                 open_fragments[fragment.id] = fragment
 149
 150             # Close existing fragment
 151             else:
 152                 try:
 153                     fragment = open_fragments[element.get('fid')]
 154                 except KeyError:
 155                     print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
 156                 else:
 157                     closed_fragments[fragment.id] = fragment
 158                     del open_fragments[fragment.id]
 159
 160             # Append element tail to lost_text (we don't want to lose any text)
 161             if element.tail:
 162                 for fragment_id in open_fragments:
 163                     open_fragments[fragment_id].append('text', element.tail)
 164
 165
 166         # Process all elements except begin and end
 167         else:
 168             # Omit annotation tags
 169             if len(element.get('name', '')) or element.get('class', '') == 'annotation':
 170                 if event == 'end' and element.tail:
 171                     for fragment_id in open_fragments:
 172                         open_fragments[fragment_id].append('text', element.tail)
 173             else:
 174                 for fragment_id in open_fragments:
 175                     open_fragments[fragment_id].append(event, copy.copy(element))
 176
 177     return closed_fragments, open_fragments
 178
 179
 180 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 181     if with_link:
 182         if link_text is None:
 183             link_text = prefix
 184         anchor = etree.Element('a', href='#%s' % prefix)
 185         anchor.set('class', 'anchor')
 186         anchor.text = unicode(link_text)
 187         if element.text:
 188             anchor.tail = element.text
 189             element.text = u''
 190         element.insert(0, anchor)
 191
 192     if with_target:
 193         anchor_target = etree.Element('a', name='%s' % prefix)
 194         anchor_target.set('class', 'target')
 195         anchor_target.text = u' '
 196         if element.text:
 197             anchor_target.tail = element.text
 198             element.text = u''
 199         element.insert(0, anchor_target)
 200
 201
 202 def any_ancestor(element, test):
 203     for ancestor in element.iterancestors():
 204         if test(ancestor):
 205             return True
 206     return False
 207
 208
 209 def add_anchors(root):
 210     counter = 1
 211     for element in root.iterdescendants():
 212         if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
 213         or e.get('id') == 'nota_red'
 214         or e.tag == 'blockquote'):
 215             continue
 216
 217         if element.tag == 'p' and 'verse' in element.get('class', ''):
 218             if counter == 1 or counter % 5 == 0:
 219                 add_anchor(element, "f%d" % counter, link_text=counter)
 220             counter += 1
 221         elif 'paragraph' in element.get('class', ''):
 222             add_anchor(element, "f%d" % counter, link_text=counter)
 223             counter += 1
 224
 225
 226 def add_table_of_contents(root):
 227     sections = []
 228     counter = 1
 229     for element in root.iterdescendants():
 230         if element.tag in ('h2', 'h3'):
 231             if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
 232                 continue
 233
 234             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 235                 sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), []))
 236             else:
 237                 sections.append((counter, element.tag, ''.join(element.xpath('text()')), []))
 238             add_anchor(element, "s%d" % counter, with_link=False)
 239             counter += 1
 240
 241     toc = etree.Element('div')
 242     toc.set('id', 'toc')
 243     toc_header = etree.SubElement(toc, 'h2')
 244     toc_header.text = u'Spis treści'
 245     toc_list = etree.SubElement(toc, 'ol')
 246
 247     for n, section, text, subsections in sections:
 248         section_element = etree.SubElement(toc_list, 'li')
 249         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 250
 251         if len(subsections):
 252             subsection_list = etree.SubElement(section_element, 'ol')
 253             for n, subsection, text, _ in subsections:
 254                 subsection_element = etree.SubElement(subsection_list, 'li')
 255                 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
 256
 257     root.insert(0, toc)
 258