librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import cStringIO
   8 import re
   9 import copy
  10
  11 from lxml import etree
  12 from librarian.parser import WLDocument
  13 from librarian import XHTMLNS, ParseError
  14 from librarian import functions
  15
  16 from lxml.etree import XMLSyntaxError, XSLTApplyError
  17
# Module-level side effect: executed once, when this module is imported.
# NOTE(review): presumably registers an entity-substitution extension
# function for use by the XSLT stylesheets below -- confirm in
# librarian.functions.
functions.reg_substitute_entities()
  19
  20 STYLESHEETS = {
  21     'legacy': 'xslt/book2html.xslt',
  22     'full': 'xslt/wl2html_full.xslt',
  23     'partial': 'xslt/wl2html_partial.xslt'
  24 }
  25
  26 def get_stylesheet(name):
  27     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  28
  29 def html_has_content(text):
  30     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  31
  32 def transform(input, output_filename=None, is_file=True, \
  33     parse_dublincore=True, stylesheet='legacy', options={}):
  34     """Transforms file input_filename in XML to output_filename in XHTML.
  35
  36     If output_filename is None, returns an XML,
  37     otherwise returns True if file has been written,False if it hasn't.
  38     File won't be written if it has no content.
  39     """
  40     # Parse XSLT
  41     try:
  42         style_filename = get_stylesheet(stylesheet)
  43         style = etree.parse(style_filename)
  44
  45         if is_file:
  46             document = WLDocument.from_file(input, True, \
  47                 parse_dublincore=parse_dublincore)
  48         else:
  49             document = WLDocument.from_string(input, True, \
  50                 parse_dublincore=parse_dublincore)
  51
  52         result = document.transform(style, **options)
  53         del document # no longer needed large object :)
  54
  55         if html_has_content(result):
  56             add_anchors(result.getroot())
  57             add_table_of_contents(result.getroot())
  58
  59             if output_filename is not None:
  60                 result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8')
  61             else:
  62                 return result
  63             return True
  64         else:
  65             if output_filename is not None:
  66                 return False
  67             else:
  68                 return "<empty />"
  69     except KeyError:
  70         raise ValueError("'%s' is not a valid stylesheet.")
  71     except (XMLSyntaxError, XSLTApplyError), e:
  72         raise ParseError(e)
  73
  74 class Fragment(object):
  75     def __init__(self, id, themes):
  76         super(Fragment, self).__init__()
  77         self.id = id
  78         self.themes = themes
  79         self.events = []
  80
  81     def append(self, event, element):
  82         self.events.append((event, element))
  83
  84     def closed_events(self):
  85         stack = []
  86         for event, element in self.events:
  87             if event == 'start':
  88                 stack.append(('end', element))
  89             elif event == 'end':
  90                 try:
  91                     stack.pop()
  92                 except IndexError:
  93                     print 'CLOSED NON-OPEN TAG:', element
  94
  95         stack.reverse()
  96         return self.events + stack
  97
  98     def to_string(self):
  99         result = []
 100         for event, element in self.closed_events():
 101             if event == 'start':
 102                 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 103                 if element.text:
 104                     result.append(element.text)
 105             elif event == 'end':
 106                 result.append(u'</%s>' % element.tag)
 107                 if element.tail:
 108                     result.append(element.tail)
 109             else:
 110                 result.append(element)
 111
 112         return ''.join(result)
 113
 114     def __unicode__(self):
 115         return self.to_string()
 116
 117
 118 def extract_fragments(input_filename):
 119     """Extracts theme fragments from input_filename."""
 120     open_fragments = {}
 121     closed_fragments = {}
 122
 123     for event, element in etree.iterparse(input_filename, events=('start', 'end')):
 124         # Process begin and end elements
 125         if element.get('class', '') in ('theme-begin', 'theme-end'):
 126             if not event == 'end': continue # Process elements only once, on end event
 127
 128             # Open new fragment
 129             if element.get('class', '') == 'theme-begin':
 130                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 131
 132                 # Append parents
 133                 if element.getparent().get('id', None) != 'book-text':
 134                     parents = [element.getparent()]
 135                     while parents[-1].getparent().get('id', None) != 'book-text':
 136                         parents.append(parents[-1].getparent())
 137
 138                     parents.reverse()
 139                     for parent in parents:
 140                         fragment.append('start', parent)
 141
 142                 open_fragments[fragment.id] = fragment
 143
 144             # Close existing fragment
 145             else:
 146                 try:
 147                     fragment = open_fragments[element.get('fid')]
 148                 except KeyError:
 149                     print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
 150                 else:
 151                     closed_fragments[fragment.id] = fragment
 152                     del open_fragments[fragment.id]
 153
 154             # Append element tail to lost_text (we don't want to lose any text)
 155             if element.tail:
 156                 for fragment_id in open_fragments:
 157                     open_fragments[fragment_id].append('text', element.tail)
 158
 159
 160         # Process all elements except begin and end
 161         else:
 162             # Omit annotation tags
 163             if len(element.get('name', '')) or element.get('class', '') == 'annotation':
 164                 if event == 'end' and element.tail:
 165                     for fragment_id in open_fragments:
 166                         open_fragments[fragment_id].append('text', element.tail)
 167             else:
 168                 for fragment_id in open_fragments:
 169                     open_fragments[fragment_id].append(event, copy.copy(element))
 170
 171     return closed_fragments, open_fragments
 172
 173
 174 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 175     if with_link:
 176         if link_text is None:
 177             link_text = prefix
 178         anchor = etree.Element('a', href='#%s' % prefix)
 179         anchor.set('class', 'anchor')
 180         anchor.text = unicode(link_text)
 181         if element.text:
 182             anchor.tail = element.text
 183             element.text = u''
 184         element.insert(0, anchor)
 185
 186     if with_target:
 187         anchor_target = etree.Element('a', name='%s' % prefix)
 188         anchor_target.set('class', 'target')
 189         anchor_target.text = u' '
 190         if element.text:
 191             anchor_target.tail = element.text
 192             element.text = u''
 193         element.insert(0, anchor_target)
 194
 195
 196 def any_ancestor(element, test):
 197     for ancestor in element.iterancestors():
 198         if test(ancestor):
 199             return True
 200     return False
 201
 202
 203 def add_anchors(root):
 204     counter = 1
 205     for element in root.iterdescendants():
 206         if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
 207         or e.tag == 'blockquote'):
 208             continue
 209
 210         if element.tag == 'p' and 'verse' in element.get('class', ''):
 211             if counter == 1 or counter % 5 == 0:
 212                 add_anchor(element, "f%d" % counter, link_text=counter)
 213             counter += 1
 214         elif 'paragraph' in element.get('class', ''):
 215             add_anchor(element, "f%d" % counter, link_text=counter)
 216             counter += 1
 217
 218
 219 def add_table_of_contents(root):
 220     sections = []
 221     counter = 1
 222     for element in root.iterdescendants():
 223         if element.tag in ('h2', 'h3'):
 224             if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
 225                 continue
 226
 227             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 228                 sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), []))
 229             else:
 230                 sections.append((counter, element.tag, ''.join(element.xpath('text()')), []))
 231             add_anchor(element, "s%d" % counter, with_link=False)
 232             counter += 1
 233
 234     toc = etree.Element('div')
 235     toc.set('id', 'toc')
 236     toc_header = etree.SubElement(toc, 'h2')
 237     toc_header.text = u'Spis treści'
 238     toc_list = etree.SubElement(toc, 'ol')
 239
 240     for n, section, text, subsections in sections:
 241         section_element = etree.SubElement(toc_list, 'li')
 242         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 243
 244         if len(subsections):
 245             subsection_list = etree.SubElement(section_element, 'ol')
 246             for n, subsection, text, _ in subsections:
 247                 subsection_element = etree.SubElement(subsection_list, 'li')
 248                 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
 249
 250     root.insert(0, toc)
 251