X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/e57b146bf49e38b3bb57615110b27de5b4d1ae69..ff2a09e9ecd8e9bede2d3572942bcd32f66f6198:/librarian/html.py diff --git a/librarian/html.py b/librarian/html.py index 70fc6e5..6115b31 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -4,6 +4,7 @@ # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # import os +import re import cStringIO import copy @@ -134,14 +135,17 @@ def extract_fragments(input_filename): fragment = Fragment(id=element.get('fid'), themes=element.text) # Append parents - if element.getparent().get('id', None) != 'book-text': - parents = [element.getparent()] - while parents[-1].getparent().get('id', None) != 'book-text': - parents.append(parents[-1].getparent()) - - parents.reverse() - for parent in parents: - fragment.append('start', parent) + parent = element.getparent() + parents = [] + while parent.get('id', None) != 'book-text': + cparent = copy.deepcopy(parent) + cparent.text = None + parents.append(cparent) + parent = parent.getparent() + + parents.reverse() + for parent in parents: + fragment.append('start', parent) open_fragments[fragment.id] = fragment @@ -177,25 +181,22 @@ def extract_fragments(input_filename): def add_anchor(element, prefix, with_link=True, with_target=True, link_text=None): + parent = element.getparent() + index = parent.index(element) + if with_link: if link_text is None: link_text = prefix anchor = etree.Element('a', href='#%s' % prefix) anchor.set('class', 'anchor') anchor.text = unicode(link_text) - if element.text: - anchor.tail = element.text - element.text = u'' - element.insert(0, anchor) + parent.insert(index, anchor) if with_target: anchor_target = etree.Element('a', name='%s' % prefix) anchor_target.set('class', 'target') anchor_target.text = u' ' - if element.text: - anchor_target.tail = element.text - element.text = u'' - element.insert(0, anchor_target) + parent.insert(index, anchor_target) def any_ancestor(element, test): @@ -225,7 +226,7 @@ def add_anchors(root): def raw_printable_text(element): working = copy.deepcopy(element) for e in working.findall('a'): - if e.get('class') == 'annotation': + if e.get('class') in ('annotation', 'theme-begin'): e.text = '' return etree.tostring(working, method='text', encoding=unicode).strip() @@ -292,17 +293,44 @@ def add_table_of_themes(root): root.insert(0, themes_div) - def extract_annotations(html_path): - """For each annotation, yields a tuple: anchor, text, html.""" + """Extracts annotations from HTML for annotations dictionary. + + For each annotation, yields a tuple of: + anchor, footnote type, valid qualifiers, text, html. + + """ + from .fn_qualifiers import FN_QUALIFIERS + parser = etree.HTMLParser(encoding='utf-8') tree = etree.parse(html_path, parser) footnotes = tree.find('//*[@id="footnotes"]') + re_qualifier = re.compile(ur'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014') if footnotes is not None: for footnote in footnotes.findall('div'): - anchor = footnote.find('a[@name]').get('name') + fn_type = footnote.get('class').split('-')[1] + anchor = footnote.find('a[@class="annotation"]').get('href')[1:] del footnote[:2] - text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip() - html_str = etree.tostring(footnote, method='html', encoding='utf-8') - yield anchor, text_str, html_str + footnote.text = None + if len(footnote) and footnote[-1].tail == '\n': + footnote[-1].tail = None + text_str = etree.tostring(footnote, method='text', encoding=unicode).strip() + html_str = etree.tostring(footnote, method='html', encoding=unicode).strip() + + match = re_qualifier.match(text_str) + if match: + qualifier_str = match.group(1) + qualifiers = [] + for candidate in re.split('[;,]', qualifier_str): + candidate = candidate.strip() + if candidate in FN_QUALIFIERS: + qualifiers.append(candidate) + elif candidate.startswith('z '): + subcandidate = candidate.split()[1] + if subcandidate in FN_QUALIFIERS: + qualifiers.append(subcandidate) + else: + qualifiers = [] + + yield anchor, fn_type, qualifiers, text_str, html_str