librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import cStringIO
   8 import re
   9 import copy
  10
  11 from lxml import etree
  12 from librarian.parser import WLDocument
  13 from librarian import XHTMLNS, ParseError
  14 from librarian import functions
  15
  16 from lxml.etree import XMLSyntaxError, XSLTApplyError
  17
  18 functions.reg_substitute_entities()
  19 functions.reg_person_name()
  20
  21 STYLESHEETS = {
  22     'legacy': 'xslt/book2html.xslt',
  23     'full': 'xslt/wl2html_full.xslt',
  24     'partial': 'xslt/wl2html_partial.xslt'
  25 }
  26
  27 def get_stylesheet(name):
  28     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  29
  30 def html_has_content(text):
  31     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  32
  33 def transform(input, output_filename=None, is_file=True, \
  34     parse_dublincore=True, stylesheet='legacy', options={}, flags=None):
  35     """Transforms file input_filename in XML to output_filename in XHTML.
  36
  37     If output_filename is None, returns an XML,
  38     otherwise returns True if file has been written,False if it hasn't.
  39     File won't be written if it has no content.
  40     """
  41     # Parse XSLT
  42     try:
  43         style_filename = get_stylesheet(stylesheet)
  44         style = etree.parse(style_filename)
  45
  46         if is_file:
  47             document = WLDocument.from_file(input, True, \
  48                 parse_dublincore=parse_dublincore)
  49         else:
  50             document = WLDocument.from_string(input, True, \
  51                 parse_dublincore=parse_dublincore)
  52
  53         if flags:
  54             for flag in flags:
  55                 document.edoc.getroot().set(flag, 'yes')
  56
  57         document.clean_ed_note()
  58
  59         result = document.transform(style, **options)
  60         del document # no longer needed large object :)
  61
  62         if html_has_content(result):
  63             add_anchors(result.getroot())
  64             add_table_of_contents(result.getroot())
  65
  66             if output_filename is not None:
  67                 result.write(output_filename, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8')
  68             else:
  69                 return result
  70             return True
  71         else:
  72             if output_filename is not None:
  73                 return False
  74             else:
  75                 return "<empty />"
  76     except KeyError:
  77         raise ValueError("'%s' is not a valid stylesheet.")
  78     except (XMLSyntaxError, XSLTApplyError), e:
  79         raise ParseError(e)
  80
  81 class Fragment(object):
  82     def __init__(self, id, themes):
  83         super(Fragment, self).__init__()
  84         self.id = id
  85         self.themes = themes
  86         self.events = []
  87
  88     def append(self, event, element):
  89         self.events.append((event, element))
  90
  91     def closed_events(self):
  92         stack = []
  93         for event, element in self.events:
  94             if event == 'start':
  95                 stack.append(('end', element))
  96             elif event == 'end':
  97                 try:
  98                     stack.pop()
  99                 except IndexError:
 100                     print 'CLOSED NON-OPEN TAG:', element
 101
 102         stack.reverse()
 103         return self.events + stack
 104
 105     def to_string(self):
 106         result = []
 107         for event, element in self.closed_events():
 108             if event == 'start':
 109                 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 110                 if element.text:
 111                     result.append(element.text)
 112             elif event == 'end':
 113                 result.append(u'</%s>' % element.tag)
 114                 if element.tail:
 115                     result.append(element.tail)
 116             else:
 117                 result.append(element)
 118
 119         return ''.join(result)
 120
 121     def __unicode__(self):
 122         return self.to_string()
 123
 124
 125 def extract_fragments(input_filename):
 126     """Extracts theme fragments from input_filename."""
 127     open_fragments = {}
 128     closed_fragments = {}
 129
 130     # iterparse would die on a HTML document
 131     parser = etree.HTMLParser(encoding='utf-8')
 132     buf = cStringIO.StringIO()
 133     buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
 134     buf.seek(0)
 135
 136     for event, element in etree.iterparse(buf, events=('start', 'end')):
 137         # Process begin and end elements
 138         if element.get('class', '') in ('theme-begin', 'theme-end'):
 139             if not event == 'end': continue # Process elements only once, on end event
 140
 141             # Open new fragment
 142             if element.get('class', '') == 'theme-begin':
 143                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 144
 145                 # Append parents
 146                 if element.getparent().get('id', None) != 'book-text':
 147                     parents = [element.getparent()]
 148                     while parents[-1].getparent().get('id', None) != 'book-text':
 149                         parents.append(parents[-1].getparent())
 150
 151                     parents.reverse()
 152                     for parent in parents:
 153                         fragment.append('start', parent)
 154
 155                 open_fragments[fragment.id] = fragment
 156
 157             # Close existing fragment
 158             else:
 159                 try:
 160                     fragment = open_fragments[element.get('fid')]
 161                 except KeyError:
 162                     print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
 163                 else:
 164                     closed_fragments[fragment.id] = fragment
 165                     del open_fragments[fragment.id]
 166
 167             # Append element tail to lost_text (we don't want to lose any text)
 168             if element.tail:
 169                 for fragment_id in open_fragments:
 170                     open_fragments[fragment_id].append('text', element.tail)
 171
 172
 173         # Process all elements except begin and end
 174         else:
 175             # Omit annotation tags
 176             if len(element.get('name', '')) or element.get('class', '') == 'annotation':
 177                 if event == 'end' and element.tail:
 178                     for fragment_id in open_fragments:
 179                         open_fragments[fragment_id].append('text', element.tail)
 180             else:
 181                 for fragment_id in open_fragments:
 182                     open_fragments[fragment_id].append(event, copy.copy(element))
 183
 184     return closed_fragments, open_fragments
 185
 186
 187 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 188     if with_link:
 189         if link_text is None:
 190             link_text = prefix
 191         anchor = etree.Element('a', href='#%s' % prefix)
 192         anchor.set('class', 'anchor')
 193         anchor.text = unicode(link_text)
 194         if element.text:
 195             anchor.tail = element.text
 196             element.text = u''
 197         element.insert(0, anchor)
 198
 199     if with_target:
 200         anchor_target = etree.Element('a', name='%s' % prefix)
 201         anchor_target.set('class', 'target')
 202         anchor_target.text = u' '
 203         if element.text:
 204             anchor_target.tail = element.text
 205             element.text = u''
 206         element.insert(0, anchor_target)
 207
 208
 209 def any_ancestor(element, test):
 210     for ancestor in element.iterancestors():
 211         if test(ancestor):
 212             return True
 213     return False
 214
 215
 216 def add_anchors(root):
 217     counter = 1
 218     for element in root.iterdescendants():
 219         if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
 220         or e.get('id') == 'nota_red'
 221         or e.tag == 'blockquote'):
 222             continue
 223
 224         if element.tag == 'p' and 'verse' in element.get('class', ''):
 225             if counter == 1 or counter % 5 == 0:
 226                 add_anchor(element, "f%d" % counter, link_text=counter)
 227             counter += 1
 228         elif 'paragraph' in element.get('class', ''):
 229             add_anchor(element, "f%d" % counter, link_text=counter)
 230             counter += 1
 231
 232
 233 def add_table_of_contents(root):
 234     sections = []
 235     counter = 1
 236     for element in root.iterdescendants():
 237         if element.tag in ('h2', 'h3'):
 238             if any_ancestor(element, lambda e: e.get('id') in ('footnotes',) or e.get('class') in ('person-list',)):
 239                 continue
 240
 241             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 242                 sections[-1][3].append((counter, element.tag, ''.join(element.xpath('text()')), []))
 243             else:
 244                 sections.append((counter, element.tag, ''.join(element.xpath('text()')), []))
 245             add_anchor(element, "s%d" % counter, with_link=False)
 246             counter += 1
 247
 248     toc = etree.Element('div')
 249     toc.set('id', 'toc')
 250     toc_header = etree.SubElement(toc, 'h2')
 251     toc_header.text = u'Spis treści'
 252     toc_list = etree.SubElement(toc, 'ol')
 253
 254     for n, section, text, subsections in sections:
 255         section_element = etree.SubElement(toc_list, 'li')
 256         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 257
 258         if len(subsections):
 259             subsection_list = etree.SubElement(section_element, 'ol')
 260             for n, subsection, text, _ in subsections:
 261                 subsection_element = etree.SubElement(subsection_list, 'li')
 262                 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
 263
 264     root.insert(0, toc)
 265
 266
 267 def extract_annotations(html_path):
 268     """For each annotation, yields a tuple: anchor, text, html."""
 269     parser = etree.HTMLParser(encoding='utf-8')
 270     tree = etree.parse(html_path, parser)
 271     footnotes = tree.find('//*[@id="footnotes"]')
 272     if footnotes is not None:
 273         for footnote in footnotes.findall('div'):
 274             anchor = footnote.find('a[@name]').get('name')
 275             del footnote[:2]
 276             text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
 277             html_str = etree.tostring(footnote, method='html', encoding='utf-8')
 278             yield anchor, text_str, html_str
 279