librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import cStringIO
   8 import copy
   9
  10 from lxml import etree
  11 from librarian import XHTMLNS, ParseError, OutputFile
  12 from librarian import functions
  13
  14 from lxml.etree import XMLSyntaxError, XSLTApplyError
  15
# Run the registration helpers from librarian.functions once, when this
# module is imported.
# NOTE(review): presumably these register the extension functions used by the
# XSLT stylesheets listed below -- confirm in librarian/functions.py.
functions.reg_substitute_entities()
functions.reg_person_name()
  18
  19 STYLESHEETS = {
  20     'legacy': 'xslt/book2html.xslt',
  21     'full': 'xslt/wl2html_full.xslt',
  22     'partial': 'xslt/wl2html_partial.xslt'
  23 }
  24
  25 def get_stylesheet(name):
  26     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  27
  28 def html_has_content(text):
  29     return etree.ETXPath('//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)})(text)
  30
  31 def transform(wldoc, stylesheet='legacy', options=None, flags=None):
  32     """Transforms the WL document to XHTML.
  33
  34     If output_filename is None, returns an XML,
  35     otherwise returns True if file has been written,False if it hasn't.
  36     File won't be written if it has no content.
  37     """
  38     # Parse XSLT
  39     try:
  40         style_filename = get_stylesheet(stylesheet)
  41         style = etree.parse(style_filename)
  42
  43         document = copy.deepcopy(wldoc)
  44         del wldoc
  45         document.swap_endlines()
  46
  47         if flags:
  48             for flag in flags:
  49                 document.edoc.getroot().set(flag, 'yes')
  50
  51         document.clean_ed_note()
  52
  53         if not options:
  54             options = {}
  55         result = document.transform(style, **options)
  56         del document # no longer needed large object :)
  57
  58         if html_has_content(result):
  59             add_anchors(result.getroot())
  60             add_table_of_themes(result.getroot())
  61             add_table_of_contents(result.getroot())
  62
  63             return OutputFile.from_string(etree.tostring(result, method='html',
  64                 xml_declaration=False, pretty_print=True, encoding='utf-8'))
  65         else:
  66             return None
  67     except KeyError:
  68         raise ValueError("'%s' is not a valid stylesheet.")
  69     except (XMLSyntaxError, XSLTApplyError), e:
  70         raise ParseError(e)
  71
  72 class Fragment(object):
  73     def __init__(self, id, themes):
  74         super(Fragment, self).__init__()
  75         self.id = id
  76         self.themes = themes
  77         self.events = []
  78
  79     def append(self, event, element):
  80         self.events.append((event, element))
  81
  82     def closed_events(self):
  83         stack = []
  84         for event, element in self.events:
  85             if event == 'start':
  86                 stack.append(('end', element))
  87             elif event == 'end':
  88                 try:
  89                     stack.pop()
  90                 except IndexError:
  91                     print 'CLOSED NON-OPEN TAG:', element
  92
  93         stack.reverse()
  94         return self.events + stack
  95
  96     def to_string(self):
  97         result = []
  98         for event, element in self.closed_events():
  99             if event == 'start':
 100                 result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items())))
 101                 if element.text:
 102                     result.append(element.text)
 103             elif event == 'end':
 104                 result.append(u'</%s>' % element.tag)
 105                 if element.tail:
 106                     result.append(element.tail)
 107             else:
 108                 result.append(element)
 109
 110         return ''.join(result)
 111
 112     def __unicode__(self):
 113         return self.to_string()
 114
 115
 116 def extract_fragments(input_filename):
 117     """Extracts theme fragments from input_filename."""
 118     open_fragments = {}
 119     closed_fragments = {}
 120
 121     # iterparse would die on a HTML document
 122     parser = etree.HTMLParser(encoding='utf-8')
 123     buf = cStringIO.StringIO()
 124     buf.write(etree.tostring(etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8'))
 125     buf.seek(0)
 126
 127     for event, element in etree.iterparse(buf, events=('start', 'end')):
 128         # Process begin and end elements
 129         if element.get('class', '') in ('theme-begin', 'theme-end'):
 130             if not event == 'end': continue # Process elements only once, on end event
 131
 132             # Open new fragment
 133             if element.get('class', '') == 'theme-begin':
 134                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 135
 136                 # Append parents
 137                 parent = element.getparent()
 138                 parents = []
 139                 while parent.get('id', None) != 'book-text':
 140                     cparent = copy.deepcopy(parent)
 141                     cparent.text = None
 142                     parents.append(cparent)
 143                     parent = parent.getparent()
 144
 145                 parents.reverse()
 146                 for parent in parents:
 147                     fragment.append('start', parent)
 148
 149                 open_fragments[fragment.id] = fragment
 150
 151             # Close existing fragment
 152             else:
 153                 try:
 154                     fragment = open_fragments[element.get('fid')]
 155                 except KeyError:
 156                     print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
 157                 else:
 158                     closed_fragments[fragment.id] = fragment
 159                     del open_fragments[fragment.id]
 160
 161             # Append element tail to lost_text (we don't want to lose any text)
 162             if element.tail:
 163                 for fragment_id in open_fragments:
 164                     open_fragments[fragment_id].append('text', element.tail)
 165
 166
 167         # Process all elements except begin and end
 168         else:
 169             # Omit annotation tags
 170             if (len(element.get('name', '')) or
 171                     element.get('class', '') in ('annotation', 'anchor')):
 172                 if event == 'end' and element.tail:
 173                     for fragment_id in open_fragments:
 174                         open_fragments[fragment_id].append('text', element.tail)
 175             else:
 176                 for fragment_id in open_fragments:
 177                     open_fragments[fragment_id].append(event, copy.copy(element))
 178
 179     return closed_fragments, open_fragments
 180
 181
 182 def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
 183     parent = element.getparent()
 184     index = parent.index(element)
 185
 186     if with_link:
 187         if link_text is None:
 188             link_text = prefix
 189         anchor = etree.Element('a', href='#%s' % prefix)
 190         anchor.set('class', 'anchor')
 191         anchor.text = unicode(link_text)
 192         parent.insert(index, anchor)
 193
 194     if with_target:
 195         anchor_target = etree.Element('a', name='%s' % prefix)
 196         anchor_target.set('class', 'target')
 197         anchor_target.text = u' '
 198         parent.insert(index, anchor_target)
 199
 200
 201 def any_ancestor(element, test):
 202     for ancestor in element.iterancestors():
 203         if test(ancestor):
 204             return True
 205     return False
 206
 207
 208 def add_anchors(root):
 209     counter = 1
 210     for element in root.iterdescendants():
 211         if any_ancestor(element, lambda e: e.get('class') in ('note', 'motto', 'motto_podpis', 'dedication')
 212         or e.get('id') == 'nota_red'
 213         or e.tag == 'blockquote'):
 214             continue
 215
 216         if element.tag == 'p' and 'verse' in element.get('class', ''):
 217             if counter == 1 or counter % 5 == 0:
 218                 add_anchor(element, "f%d" % counter, link_text=counter)
 219             counter += 1
 220         elif 'paragraph' in element.get('class', ''):
 221             add_anchor(element, "f%d" % counter, link_text=counter)
 222             counter += 1
 223
 224
 225 def raw_printable_text(element):
 226     working = copy.deepcopy(element)
 227     for e in working.findall('a'):
 228         if e.get('class') in ('annotation', 'theme-begin'):
 229             e.text = ''
 230     return etree.tostring(working, method='text', encoding=unicode).strip()
 231
 232
 233 def add_table_of_contents(root):
 234     sections = []
 235     counter = 1
 236     for element in root.iterdescendants():
 237         if element.tag in ('h2', 'h3'):
 238             if any_ancestor(element, lambda e: e.get('id') in ('footnotes', 'nota_red') or e.get('class') in ('person-list',)):
 239                 continue
 240
 241             element_text = raw_printable_text(element)
 242             if element.tag == 'h3' and len(sections) and sections[-1][1] == 'h2':
 243                 sections[-1][3].append((counter, element.tag, element_text, []))
 244             else:
 245                 sections.append((counter, element.tag, element_text, []))
 246             add_anchor(element, "s%d" % counter, with_link=False)
 247             counter += 1
 248
 249     toc = etree.Element('div')
 250     toc.set('id', 'toc')
 251     toc_header = etree.SubElement(toc, 'h2')
 252     toc_header.text = u'Spis treści'
 253     toc_list = etree.SubElement(toc, 'ol')
 254
 255     for n, section, text, subsections in sections:
 256         section_element = etree.SubElement(toc_list, 'li')
 257         add_anchor(section_element, "s%d" % n, with_target=False, link_text=text)
 258
 259         if len(subsections):
 260             subsection_list = etree.SubElement(section_element, 'ol')
 261             for n, subsection, text, _ in subsections:
 262                 subsection_element = etree.SubElement(subsection_list, 'li')
 263                 add_anchor(subsection_element, "s%d" % n, with_target=False, link_text=text)
 264
 265     root.insert(0, toc)
 266
 267
 268 def add_table_of_themes(root):
 269     try:
 270         from sortify import sortify
 271     except ImportError:
 272         sortify = lambda x: x
 273
 274     book_themes = {}
 275     for fragment in root.findall('.//a[@class="theme-begin"]'):
 276         if not fragment.text:
 277             continue
 278         theme_names = [s.strip() for s in fragment.text.split(',')]
 279         for theme_name in theme_names:
 280             book_themes.setdefault(theme_name, []).append(fragment.get('name'))
 281     book_themes = book_themes.items()
 282     book_themes.sort(key=lambda s: sortify(s[0]))
 283     themes_div = etree.Element('div', id="themes")
 284     themes_ol = etree.SubElement(themes_div, 'ol')
 285     for theme_name, fragments in book_themes:
 286         themes_li = etree.SubElement(themes_ol, 'li')
 287         themes_li.text = "%s: " % theme_name
 288         for i, fragment in enumerate(fragments):
 289             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
 290             item.text = str(i + 1)
 291             item.tail = ' '
 292     root.insert(0, themes_div)
 293
 294
 295
 296 def extract_annotations(html_path):
 297     """For each annotation, yields a tuple: anchor, text, html."""
 298     parser = etree.HTMLParser(encoding='utf-8')
 299     tree = etree.parse(html_path, parser)
 300     footnotes = tree.find('//*[@id="footnotes"]')
 301     if footnotes is not None:
 302         for footnote in footnotes.findall('div'):
 303             anchor = footnote.find('a[@name]').get('name')
 304             del footnote[:2]
 305             text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
 306             html_str = etree.tostring(footnote, method='html', encoding='utf-8')
 307             yield anchor, text_str, html_str
 308