librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import cStringIO
   8 import copy
   9
  10 from lxml import etree
  11 from librarian import XHTMLNS, ParseError, IOFile
  12 from librarian import functions
  13
  14 from lxml.etree import XMLSyntaxError, XSLTApplyError
  15
# Register the helper functions provided by librarian.functions once, at
# import time.  NOTE(review): presumably these are XSLT extension functions
# used by the stylesheets listed below -- confirm in librarian/functions.py.
functions.reg_substitute_entities()
functions.reg_person_name()

# Maps the stylesheet names accepted by transform() to XSLT files.  The paths
# are relative to this module's directory; get_stylesheet() resolves them.
STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
    'full': 'xslt/wl2html_full.xslt',
    'partial': 'xslt/wl2html_partial.xslt'
}
  24
  25
  26 def get_stylesheet(name):
  27     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  28
  29
  30 def html_has_content(text):
  31     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  32
  33
  34 def transform(wldoc, stylesheet='legacy', options=None, flags=None):
  35     """Transforms the WL document to XHTML.
  36
  37     If output_filename is None, returns an XML,
  38     otherwise returns True if file has been written,False if it hasn't.
  39     File won't be written if it has no content.
  40     """
  41     # Parse XSLT
  42     try:
  43         style_filename = get_stylesheet(stylesheet)
  44         style = etree.parse(style_filename)
  45
  46         document = copy.deepcopy(wldoc)
  47         del wldoc
  48         document.swap_endlines()
  49
  50         if flags:
  51             for flag in flags:
  52                 document.edoc.getroot().set(flag, 'yes')
  53
  54         document.clean_ed_note()
  55
  56         if not options:
  57             options = {}
  58         result = document.transform(style, **options)
  59         del document  # no longer needed large object :)
  60
  61         if html_has_content(result):
  62             add_anchors(result.getroot())
  63             add_table_of_contents(result.getroot())
  64
  65             return IOFile.from_string(
  66                 etree.tostring(result, method='html', xml_declaration=False, pretty_print=True, encoding='utf-8'))
  67         else:
  68             return None
  69     except KeyError:
  70         raise ValueError("'%s' is not a valid stylesheet.")
  71     except (XMLSyntaxError, XSLTApplyError), e:
  72         raise ParseError(e)
  73
  74
  75 class Fragment(object):
  76     def __init__(self, id, themes):
  77         super(Fragment, self).__init__()
  78         self.id = id
  79         self.themes = themes
  80         self.events = []
  81
  82     def append(self, event, element):
  83         self.events.append((event, element))
  84
  85     def closed_events(self):
  86         stack = []
  87         for event, element in self.events:
  88             if event == 'start':
  89                 stack.append(('end', element))
  90             elif event == 'end':
  91                 try:
  92                     stack.pop()
  93                 except IndexError:
  94                     print 'CLOSED NON-OPEN TAG:', element
  95
  96         stack.reverse()
  97         return self.events + stack
  98
  99     def to_string(self):
 100         result = []
 101         for event, element in self.closed_events():
 102             if event == 'start':
 103                 result.append(u'<%s %s>' % (
 104                     element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 105                 if element.text:
 106                     result.append(element.text)
 107             elif event == 'end':
 108                 result.append(u'</%s>' % element.tag)
 109                 if element.tail:
 110                     result.append(element.tail)
 111             else:
 112                 result.append(element)
 113
 114         return ''.join(result)
 115
 116     def __unicode__(self):
 117         return self.to_string()
 118
 119
def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename.

    The HTML file is scanned for marker elements whose class is
    'theme-begin' or 'theme-end'; a marker's 'fid' attribute identifies
    the fragment and the text of a 'theme-begin' marker becomes the
    fragment's themes.  Everything met while a fragment is open is
    recorded in it as Fragment events.

    Returns a tuple ``(closed_fragments, open_fragments)`` of dicts mapping
    fragment id to Fragment: the first holds fragments that were both
    opened and closed, the second those still open when the input ended.
    """
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document, so the file is first read with
    # the HTML parser and one subtree of it (the first child of the root's
    # first child) is re-serialized into a buffer that iterparse can read.
    parser = etree.HTMLParser(encoding='utf-8')
    buf = cStringIO.StringIO()
    buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Append parents: the fragment starts with 'start' events for
                # the marker's ancestors, outermost first, up to but not
                # including the element with id 'book-text'.
                if element.getparent().get('id', None) != 'book-text':
                    parents = [element.getparent()]
                    while parents[-1].getparent().get('id', None) != 'book-text':
                        parents.append(parents[-1].getparent())

                    parents.reverse()
                    for parent in parents:
                        fragment.append('start', parent)

                open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    # An end marker without a matching open fragment is only
                    # reported; processing continues.
                    print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the marker's tail to every fragment that is open now
            # (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags: elements with a non-empty 'name' attribute
            # or with class 'annotation' or 'anchor' are left out, but the
            # text following them is kept.
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append('text', element.tail)
            else:
                # Record a shallow copy of the element, taken at the time of
                # the event, in every open fragment.
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append(event, copy.copy(element))

    return closed_fragments, open_fragments
 181
 182
 183 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 184     if with_link:
 185         if link_text is None:
 186             link_text = prefix
 187         anchor = etree.Element('a', href='#%s' % prefix)
 188         anchor.set('class', 'anchor')
 189         anchor.text = unicode(link_text)
 190         if element.text:
 191             anchor.tail = element.text
 192             element.text = u''
 193         element.insert(0, anchor)
 194
 195     if with_target:
 196         anchor_target = etree.Element('a', name='%s' % prefix)
 197         anchor_target.set('class', 'target')
 198         anchor_target.text = u' '
 199         if element.text:
 200             anchor_target.tail = element.text
 201             element.text = u''
 202         element.insert(0, anchor_target)
 203
 204
 205 def any_ancestor(element, test):
 206     for ancestor in element.iterancestors():
 207         if test(ancestor):
 208             return True
 209     return False
 210
 211
 212 def add_anchors(root):
 213     counter = 1
 214
 215     def is_side_text(e):
 216         side_classes = ('note', 'motto', 'motto_podpis', 'dedication')
 217         return e.get('class') in side_classes or e.get('id') == 'nota_red' or e.tag == 'blockquote'
 218
 219     for element in root.iterdescendants():
 220         if any_ancestor(element, is_side_text):
 221             continue
 222
 223         if element.tag == 'p' and 'verse' in element.get('class', ''):
 224             if counter == 1 or counter % 5 == 0:
 225                 add_anchor(element, "f%d" % counter, link_text=counter)
 226             counter += 1
 227         elif 'paragraph' in element.get('class', ''):
 228             add_anchor(element, "f%d" % counter, link_text=counter)
 229             counter += 1
 230
 231
 232 def raw_printable_text(element):
 233     working = copy.deepcopy(element)
 234     for e in working.findall('a'):
 235         if e.get('class') == 'annotation':
 236             e.text = ''
 237     return etree.tostring(working, method='text', encoding=unicode).strip()
 238
 239
 240 def add_table_of_contents(root):
 241     sections = []
 242     counter = 1
 243
 244     def is_side_text(e):
 245         return e.get('id') in ('footnotes', 'nota_red') or e.get('class') == 'person-list'
 246
 247     for element in root.iterdescendants():
 248         if element.tag in ('h2', 'h3'):
 249             if any_ancestor(element, is_side_text):
 250                 continue
 251
 252             element_text = raw_printable_text(element)
 253             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 254                 sections[-1][3].append((counter, element.tag, element_text, []))
 255             else:
 256                 sections.append((counter, element.tag, element_text, []))
 257             add_anchor(element, "s%d" % counter, with_link=False)
 258             counter += 1
 259
 260     toc = etree.Element('div')
 261     toc.set('id', 'toc')
 262     toc_header = etree.SubElement(toc, 'h2')
 263     toc_header.text = u'Spis treści'
 264     toc_list = etree.SubElement(toc, 'ol')
 265
 266     for n, section, text, subsections in sections:
 267         section_element = etree.SubElement(toc_list, 'li')
 268         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 269
 270         if len(subsections):
 271             subsection_list = etree.SubElement(section_element, 'ol')
 272             for n1, subsection, text1, _ in subsections:
 273                 subsection_element = etree.SubElement(subsection_list, 'li')
 274                 add_anchor(subsection_element, "s%d" % n1, with_target=False, link_text=text1)
 275
 276     root.insert(0, toc)
 277
 278
 279 def extract_annotations(html_path):
 280     """For each annotation, yields a tuple: anchor, text, html."""
 281     parser = etree.HTMLParser(encoding='utf-8')
 282     tree = etree.parse(html_path, parser)
 283     footnotes = tree.find('//*[@id="footnotes"]')
 284     if footnotes is not None:
 285         for footnote in footnotes.findall('div'):
 286             anchor = footnote.find('a[@name]').get('name')
 287             del footnote[:2]
 288             text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
 289             html_str = etree.tostring(footnote, method='html', encoding='utf-8')
 290             yield anchor, text_str, html_str