librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import cStringIO
   8 import re
   9 import copy
  10
  11 from lxml import etree
  12 from librarian.parser import WLDocument
  13 from librarian import XHTMLNS, ParseError
  14 from librarian import functions
  15
  16 from lxml.etree import XMLSyntaxError, XSLTApplyError
  17
  18 functions.reg_substitute_entities()
  19 functions.reg_person_name()
  20
  21 STYLESHEETS = {
  22     'legacy': 'xslt/book2html.xslt',
  23     'full': 'xslt/wl2html_full.xslt',
  24     'partial': 'xslt/wl2html_partial.xslt'
  25 }
  26
  27 def get_stylesheet(name):
  28     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  29
  30 def html_has_content(text):
  31     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  32
  33 def transform(input, output_filename=None, is_file=True, \
  34     parse_dublincore=True, stylesheet='legacy', options={}, flags=None):
  35     """Transforms file input_filename in XML to output_filename in XHTML.
  36
  37     If output_filename is None, returns an XML,
  38     otherwise returns True if file has been written,False if it hasn't.
  39     File won't be written if it has no content.
  40     """
  41     # Parse XSLT
  42     try:
  43         style_filename = get_stylesheet(stylesheet)
  44         style = etree.parse(style_filename)
  45
  46         if is_file:
  47             document = WLDocument.from_file(input, True, \
  48                 parse_dublincore=parse_dublincore)
  49         else:
  50             document = WLDocument.from_string(input, True, \
  51                 parse_dublincore=parse_dublincore)
  52
  53         if flags:
  54             for flag in flags:
  55                 document.edoc.getroot().set(flag, 'yes')
  56
  57         document.clean_ed_note()
  58
  59         result = document.transform(style, **options)
  60         del document # no longer needed large object :)
  61
  62         if html_has_content(result):
  63             add_anchors(result.getroot())
  64             add_table_of_contents(result.getroot())
  65
  66             if output_filename is not None:
  67                 result.write(output_filename, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8')
  68             else:
  69                 return result
  70             return True
  71         else:
  72             if output_filename is not None:
  73                 return False
  74             else:
  75                 return "<empty />"
  76     except KeyError:
  77         raise ValueError("'%s' is not a valid stylesheet.")
  78     except (XMLSyntaxError, XSLTApplyError), e:
  79         raise ParseError(e)
  80
  81 class Fragment(object):
  82     def __init__(self, id, themes):
  83         super(Fragment, self).__init__()
  84         self.id = id
  85         self.themes = themes
  86         self.events = []
  87
  88     def append(self, event, element):
  89         self.events.append((event, element))
  90
  91     def closed_events(self):
  92         stack = []
  93         for event, element in self.events:
  94             if event == 'start':
  95                 stack.append(('end', element))
  96             elif event == 'end':
  97                 try:
  98                     stack.pop()
  99                 except IndexError:
 100                     print 'CLOSED NON-OPEN TAG:', element
 101
 102         stack.reverse()
 103         return self.events + stack
 104
 105     def to_string(self):
 106         result = []
 107         for event, element in self.closed_events():
 108             if event == 'start':
 109                 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 110                 if element.text:
 111                     result.append(element.text)
 112             elif event == 'end':
 113                 result.append(u'</%s>' % element.tag)
 114                 if element.tail:
 115                     result.append(element.tail)
 116             else:
 117                 result.append(element)
 118
 119         return ''.join(result)
 120
 121     def __unicode__(self):
 122         return self.to_string()
 123
 124
 125 def extract_fragments(input_filename):
 126     """Extracts theme fragments from input_filename."""
 127     open_fragments = {}
 128     closed_fragments = {}
 129
 130     for event, element in etree.iterparse(input_filename, events=('start', 'end')):
 131         # Process begin and end elements
 132         if element.get('class', '') in ('theme-begin', 'theme-end'):
 133             if not event == 'end': continue # Process elements only once, on end event
 134
 135             # Open new fragment
 136             if element.get('class', '') == 'theme-begin':
 137                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 138
 139                 # Append parents
 140                 if element.getparent().get('id', None) != 'book-text':
 141                     parents = [element.getparent()]
 142                     while parents[-1].getparent().get('id', None) != 'book-text':
 143                         parents.append(parents[-1].getparent())
 144
 145                     parents.reverse()
 146                     for parent in parents:
 147                         fragment.append('start', parent)
 148
 149                 open_fragments[fragment.id] = fragment
 150
 151             # Close existing fragment
 152             else:
 153                 try:
 154                     fragment = open_fragments[element.get('fid')]
 155                 except KeyError:
 156                     print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
 157                 else:
 158                     closed_fragments[fragment.id] = fragment
 159                     del open_fragments[fragment.id]
 160
 161             # Append element tail to lost_text (we don't want to lose any text)
 162             if element.tail:
 163                 for fragment_id in open_fragments:
 164                     open_fragments[fragment_id].append('text', element.tail)
 165
 166
 167         # Process all elements except begin and end
 168         else:
 169             # Omit annotation tags
 170             if len(element.get('name', '')) or element.get('class', '') == 'annotation':
 171                 if event == 'end' and element.tail:
 172                     for fragment_id in open_fragments:
 173                         open_fragments[fragment_id].append('text', element.tail)
 174             else:
 175                 for fragment_id in open_fragments:
 176                     open_fragments[fragment_id].append(event, copy.copy(element))
 177
 178     return closed_fragments, open_fragments
 179
 180
 181 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 182     if with_link:
 183         if link_text is None:
 184             link_text = prefix
 185         anchor = etree.Element('a', href='#%s' % prefix)
 186         anchor.set('class', 'anchor')
 187         anchor.text = unicode(link_text)
 188         if element.text:
 189             anchor.tail = element.text
 190             element.text = u''
 191         element.insert(0, anchor)
 192
 193     if with_target:
 194         anchor_target = etree.Element('a', name='%s' % prefix)
 195         anchor_target.set('class', 'target')
 196         anchor_target.text = u' '
 197         if element.text:
 198             anchor_target.tail = element.text
 199             element.text = u''
 200         element.insert(0, anchor_target)
 201
 202
 203 def any_ancestor(element, test):
 204     for ancestor in element.iterancestors():
 205         if test(ancestor):
 206             return True
 207     return False
 208
 209
 210 def add_anchors(root):
 211     counter = 1
 212     for element in root.iterdescendants():
 213         if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
 214         or e.get('id') == 'nota_red'
 215         or e.tag == 'blockquote'):
 216             continue
 217
 218         if element.tag == 'p' and 'verse' in element.get('class', ''):
 219             if counter == 1 or counter % 5 == 0:
 220                 add_anchor(element, "f%d" % counter, link_text=counter)
 221             counter += 1
 222         elif 'paragraph' in element.get('class', ''):
 223             add_anchor(element, "f%d" % counter, link_text=counter)
 224             counter += 1
 225
 226
 227 def add_table_of_contents(root):
 228     sections = []
 229     counter = 1
 230     for element in root.iterdescendants():
 231         if element.tag in ('h2', 'h3'):
 232             if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
 233                 continue
 234
 235             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 236                 sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), []))
 237             else:
 238                 sections.append((counter, element.tag, ''.join(element.xpath('text()')), []))
 239             add_anchor(element, "s%d" % counter, with_link=False)
 240             counter += 1
 241
 242     toc = etree.Element('div')
 243     toc.set('id', 'toc')
 244     toc_header = etree.SubElement(toc, 'h2')
 245     toc_header.text = u'Spis treści'
 246     toc_list = etree.SubElement(toc, 'ol')
 247
 248     for n, section, text, subsections in sections:
 249         section_element = etree.SubElement(toc_list, 'li')
 250         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 251
 252         if len(subsections):
 253             subsection_list = etree.SubElement(section_element, 'ol')
 254             for n, subsection, text, _ in subsections:
 255                 subsection_element = etree.SubElement(subsection_list, 'li')
 256                 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
 257
 258     root.insert(0, toc)
 259