# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
import os
+import re
import cStringIO
import copy
fragment = Fragment(id=element.get('fid'), themes=element.text)
# Append parents
- if element.getparent().get('id', None) != 'book-text':
- parents = [element.getparent()]
- while parents[-1].getparent().get('id', None) != 'book-text':
- parents.append(parents[-1].getparent())
-
- parents.reverse()
- for parent in parents:
- fragment.append('start', parent)
+ parent = element.getparent()
+ parents = []
+ while parent.get('id', None) != 'book-text':
+ cparent = copy.deepcopy(parent)
+ cparent.text = None
+ parents.append(cparent)
+ parent = parent.getparent()
+
+ parents.reverse()
+ for parent in parents:
+ fragment.append('start', parent)
open_fragments[fragment.id] = fragment
def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None):
    """Insert anchor ``<a>`` elements as siblings directly before ``element``.

    Nothing is added inside ``element`` itself; the new nodes are placed in
    its parent, at the position ``element`` occupied before the call.

    Parameters:
        element: node that the anchors should precede; it must have a parent.
        prefix: identifier used for the link's ``href`` (as ``#prefix``) and
            for the target's ``name``.
        with_link: when true, add ``<a class="anchor" href="#prefix">``.
        with_target: when true, add ``<a class="target" name="prefix">``
            whose text is a single space.
        link_text: visible text of the link; ``prefix`` is used when None.
    """
    container = element.getparent()
    position = container.index(element)

    if with_link:
        label = prefix if link_text is None else link_text
        link = etree.Element('a', href='#%s' % prefix)
        link.set('class', 'anchor')
        link.text = unicode(label)
        container.insert(position, link)

    if with_target:
        target = etree.Element('a', name='%s' % prefix)
        target.set('class', 'target')
        target.text = u' '
        # Same index as the link on purpose: the target ends up in front of
        # the link, which in turn stays in front of ``element``.
        container.insert(position, target)
def any_ancestor(element, test):
def raw_printable_text(element):
    """Return the whitespace-stripped text content of ``element``.

    The work is done on a deep copy, so ``element`` is left unmodified.
    Before serialising, the own text of every direct ``<a>`` child whose
    ``class`` is ``annotation`` or ``theme-begin`` is blanked; the tails of
    those links are kept.
    """
    clone = copy.deepcopy(element)
    hidden_classes = ('annotation', 'theme-begin')
    for link in clone.findall('a'):
        if link.get('class') not in hidden_classes:
            continue
        link.text = ''
    serialized = etree.tostring(clone, method='text', encoding=unicode)
    return serialized.strip()
for i, fragment in enumerate(fragments):
item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
item.text = str(i + 1)
+ item.tail = ' '
root.insert(0, themes_div)
-
def extract_annotations(html_path):
"""For each annotation, yields a tuple: anchor, text, html."""
parser = etree.HTMLParser(encoding='utf-8')
tree = etree.parse(html_path, parser)
footnotes = tree.find('//*[@id="footnotes"]')
+ re_qualifier = re.compile(ur'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
if footnotes is not None:
for footnote in footnotes.findall('div'):
- anchor = footnote.find('a[@name]').get('name')
+ fn_type = footnote.get('class').split('-')[1]
+ anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
del footnote[:2]
- text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
- html_str = etree.tostring(footnote, method='html', encoding='utf-8')
- yield anchor, text_str, html_str
+ footnote.text = None
+ if len(footnote) and footnote[-1].tail == '\n':
+ footnote[-1].tail = None
+ text_str = etree.tostring(footnote, method='text', encoding=unicode).strip()
+ html_str = etree.tostring(footnote, method='html', encoding=unicode).strip()
+ qualifier = None
+ match = re_qualifier.match(text_str)
+ if match:
+ qualifier = match.group(1)
+
+ yield anchor, fn_type, qualifier, text_str, html_str