librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import cStringIO
   8 import copy
   9
  10 from lxml import etree
  11 from librarian import XHTMLNS, ParseError, IOFile
  12 from librarian import functions
  13
  14 from lxml.etree import XMLSyntaxError, XSLTApplyError
  15
  16 functions.reg_substitute_entities()
  17 functions.reg_person_name()
  18
  19 STYLESHEETS = {
  20     'legacy': 'xslt/book2html.xslt',
  21     'full': 'xslt/wl2html_full.xslt',
  22     'partial': 'xslt/wl2html_partial.xslt'
  23 }
  24
  25 def get_stylesheet(name):
  26     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  27
  28 def html_has_content(text):
  29     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  30
  31 def transform(wldoc, stylesheet='legacy', options=None, flags=None):
  32     """Transforms the WL document to XHTML.
  33
  34     If output_filename is None, returns an XML,
  35     otherwise returns True if file has been written,False if it hasn't.
  36     File won't be written if it has no content.
  37     """
  38     # Parse XSLT
  39     try:
  40         style_filename = get_stylesheet(stylesheet)
  41         style = etree.parse(style_filename)
  42
  43         document = copy.deepcopy(wldoc)
  44         del wldoc
  45         document.swap_endlines()
  46
  47         if flags:
  48             for flag in flags:
  49                 document.edoc.getroot().set(flag, 'yes')
  50
  51         document.clean_ed_note()
  52
  53         if not options:
  54             options = {}
  55         result = document.transform(style, **options)
  56         del document # no longer needed large object :)
  57
  58         if html_has_content(result):
  59             add_anchors(result.getroot())
  60             add_table_of_contents(result.getroot())
  61
  62             return IOFile.from_string(etree.tostring(result, method='html',
  63                 xml_declaration=False, pretty_print=True, encoding='utf-8'))
  64         else:
  65             return None
  66     except KeyError:
  67         raise ValueError("'%s' is not a valid stylesheet.")
  68     except (XMLSyntaxError, XSLTApplyError), e:
  69         raise ParseError(e)
  70
  71 class Fragment(object):
  72     def __init__(self, id, themes):
  73         super(Fragment, self).__init__()
  74         self.id = id
  75         self.themes = themes
  76         self.events = []
  77
  78     def append(self, event, element):
  79         self.events.append((event, element))
  80
  81     def closed_events(self):
  82         stack = []
  83         for event, element in self.events:
  84             if event == 'start':
  85                 stack.append(('end', element))
  86             elif event == 'end':
  87                 try:
  88                     stack.pop()
  89                 except IndexError:
  90                     print 'CLOSED NON-OPEN TAG:', element
  91
  92         stack.reverse()
  93         return self.events + stack
  94
  95     def to_string(self):
  96         result = []
  97         for event, element in self.closed_events():
  98             if event == 'start':
  99                 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 100                 if element.text:
 101                     result.append(element.text)
 102             elif event == 'end':
 103                 result.append(u'</%s>' % element.tag)
 104                 if element.tail:
 105                     result.append(element.tail)
 106             else:
 107                 result.append(element)
 108
 109         return ''.join(result)
 110
 111     def __unicode__(self):
 112         return self.to_string()
 113
 114
 115 def extract_fragments(input_filename):
 116     """Extracts theme fragments from input_filename."""
 117     open_fragments = {}
 118     closed_fragments = {}
 119
 120     # iterparse would die on a HTML document
 121     parser = etree.HTMLParser(encoding='utf-8')
 122     buf = cStringIO.StringIO()
 123     buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
 124     buf.seek(0)
 125
 126     for event, element in etree.iterparse(buf, events=('start', 'end')):
 127         # Process begin and end elements
 128         if element.get('class', '') in ('theme-begin', 'theme-end'):
 129             if not event == 'end': continue # Process elements only once, on end event
 130
 131             # Open new fragment
 132             if element.get('class', '') == 'theme-begin':
 133                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 134
 135                 # Append parents
 136                 if element.getparent().get('id', None) != 'book-text':
 137                     parents = [element.getparent()]
 138                     while parents[-1].getparent().get('id', None) != 'book-text':
 139                         parents.append(parents[-1].getparent())
 140
 141                     parents.reverse()
 142                     for parent in parents:
 143                         fragment.append('start', parent)
 144
 145                 open_fragments[fragment.id] = fragment
 146
 147             # Close existing fragment
 148             else:
 149                 try:
 150                     fragment = open_fragments[element.get('fid')]
 151                 except KeyError:
 152                     print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
 153                 else:
 154                     closed_fragments[fragment.id] = fragment
 155                     del open_fragments[fragment.id]
 156
 157             # Append element tail to lost_text (we don't want to lose any text)
 158             if element.tail:
 159                 for fragment_id in open_fragments:
 160                     open_fragments[fragment_id].append('text', element.tail)
 161
 162
 163         # Process all elements except begin and end
 164         else:
 165             # Omit annotation tags
 166             if (len(element.get('name', '')) or
 167                     element.get('class', '') in ('annotation', 'anchor')):
 168                 if event == 'end' and element.tail:
 169                     for fragment_id in open_fragments:
 170                         open_fragments[fragment_id].append('text', element.tail)
 171             else:
 172                 for fragment_id in open_fragments:
 173                     open_fragments[fragment_id].append(event, copy.copy(element))
 174
 175     return closed_fragments, open_fragments
 176
 177
 178 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 179     if with_link:
 180         if link_text is None:
 181             link_text = prefix
 182         anchor = etree.Element('a', href='#%s' % prefix)
 183         anchor.set('class', 'anchor')
 184         anchor.text = unicode(link_text)
 185         if element.text:
 186             anchor.tail = element.text
 187             element.text = u''
 188         element.insert(0, anchor)
 189
 190     if with_target:
 191         anchor_target = etree.Element('a', name='%s' % prefix)
 192         anchor_target.set('class', 'target')
 193         anchor_target.text = u' '
 194         if element.text:
 195             anchor_target.tail = element.text
 196             element.text = u''
 197         element.insert(0, anchor_target)
 198
 199
 200 def any_ancestor(element, test):
 201     for ancestor in element.iterancestors():
 202         if test(ancestor):
 203             return True
 204     return False
 205
 206
 207 def add_anchors(root):
 208     counter = 1
 209     for element in root.iterdescendants():
 210         if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
 211         or e.get('id') == 'nota_red'
 212         or e.tag == 'blockquote'):
 213             continue
 214
 215         if element.tag == 'p' and 'verse' in element.get('class', ''):
 216             if counter == 1 or counter % 5 == 0:
 217                 add_anchor(element, "f%d" % counter, link_text=counter)
 218             counter += 1
 219         elif 'paragraph' in element.get('class', ''):
 220             add_anchor(element, "f%d" % counter, link_text=counter)
 221             counter += 1
 222
 223
 224 def raw_printable_text(element):
 225     working = copy.deepcopy(element)
 226     for e in working.findall('a'):
 227         if e.get('class') == 'annotation':
 228             e.text = ''
 229     return etree.tostring(working, method='text', encoding=unicode).strip()
 230
 231
 232 def add_table_of_contents(root):
 233     sections = []
 234     counter = 1
 235     for element in root.iterdescendants():
 236         if element.tag in ('h2', 'h3'):
 237             if any_ancestor(element, lambda e: e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)):
 238                 continue
 239
 240             element_text = raw_printable_text(element)
 241             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 242                 sections[-1][3].append((counter, element.tag, element_text, []))
 243             else:
 244                 sections.append((counter, element.tag, element_text, []))
 245             add_anchor(element, "s%d" % counter, with_link=False)
 246             counter += 1
 247
 248     toc = etree.Element('div')
 249     toc.set('id', 'toc')
 250     toc_header = etree.SubElement(toc, 'h2')
 251     toc_header.text = u'Spis treści'
 252     toc_list = etree.SubElement(toc, 'ol')
 253
 254     for n, section, text, subsections in sections:
 255         section_element = etree.SubElement(toc_list, 'li')
 256         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 257
 258         if len(subsections):
 259             subsection_list = etree.SubElement(section_element, 'ol')
 260             for n, subsection, text, _ in subsections:
 261                 subsection_element = etree.SubElement(subsection_list, 'li')
 262                 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
 263
 264     root.insert(0, toc)
 265
 266
 267 def extract_annotations(html_path):
 268     """For each annotation, yields a tuple: anchor, text, html."""
 269     parser = etree.HTMLParser(encoding='utf-8')
 270     tree = etree.parse(html_path, parser)
 271     footnotes = tree.find('//*[@id="footnotes"]')
 272     if footnotes is not None:
 273         for footnote in footnotes.findall('div'):
 274             anchor = footnote.find('a[@name]').get('name')
 275             del footnote[:2]
 276             text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
 277             html_str = etree.tostring(footnote, method='html', encoding='utf-8')
 278             yield anchor, text_str, html_str
 279