# Reconstructed from the post-patch ("+") side of a diff against
# librarian/html.py.  ``etree`` is expected to be provided by the enclosing
# module (the HTMLParser usage suggests lxml.etree -- confirm against the
# file's import block).


def _valid_qualifiers(qualifier_str, known):
    """Return the recognised qualifiers listed in *qualifier_str*.

    *qualifier_str* is split on ``;`` and ``,``; each stripped candidate is
    kept when it is a member of *known*.  A candidate of the form
    ``"z <name>"`` contributes ``<name>`` when that is a member of *known*.
    The order of appearance is preserved.
    """
    qualifiers = []
    # Equivalent to re.split('[;,]', qualifier_str).
    for candidate in qualifier_str.replace(';', ',').split(','):
        candidate = candidate.strip()
        if candidate in known:
            qualifiers.append(candidate)
        elif candidate.startswith('z '):
            # The candidate is stripped and starts with 'z ', so a second
            # whitespace-separated token is guaranteed to exist.
            subcandidate = candidate.split()[1]
            if subcandidate in known:
                qualifiers.append(subcandidate)
    return qualifiers


def extract_annotations(html_path):
    """Extracts annotations from HTML for annotations dictionary.

    For each annotation, yields a tuple of:
    anchor, footnote type, valid qualifiers, text, html.

    *html_path* is handed to ``etree.parse`` with a UTF-8 HTML parser.
    Every ``div`` directly under the element with ``id="footnotes"`` is
    treated as one footnote; nothing is yielded when that element is absent.

    * anchor -- ``href`` of the footnote's ``a`` child with
      ``class="annotation"``, minus its first character (presumably ``#``).
    * footnote type -- second ``-``-separated part of the div's ``class``.
    * valid qualifiers -- entries of the parenthesised list that precedes
      the first em dash of the text and that appear in ``FN_QUALIFIERS``;
      an empty list when the text does not have that shape.
    * text, html -- the footnote serialised as text and as HTML (unicode
      strings, stripped), after its first two child elements and its
      leading text have been removed.

    A footnote div without a hyphenated ``class`` or without the annotation
    link raises (AttributeError / IndexError), as before.
    """
    import re

    from .fn_qualifiers import FN_QUALIFIERS

    # ``unicode`` on Python 2, ``str`` on Python 3; passing this type as the
    # encoding asks etree.tostring for a text (not bytes) result.
    text_type = type(u'')

    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    if footnotes is None:
        return

    # "<term> (<qualifiers>) \u2014 ..." anchored at the start of the text.
    # Same pattern as the original ur'' literal, spelled so that it also
    # parses on Python 3.
    re_qualifier = re.compile(u'[^\u2014]+\\s+\\(([^\\)]+)\\)\\s+\u2014')

    for footnote in footnotes.findall('div'):
        fn_type = footnote.get('class').split('-')[1]
        anchor = footnote.find('a[@class="annotation"]').get('href')[1:]

        # Drop the first two children (presumably the anchor links -- TODO
        # confirm against the generated HTML) and any leading text so only
        # the annotation body is serialised.
        del footnote[:2]
        footnote.text = None
        if len(footnote) and footnote[-1].tail == '\n':
            footnote[-1].tail = None

        text_str = etree.tostring(
            footnote, method='text', encoding=text_type).strip()
        html_str = etree.tostring(
            footnote, method='html', encoding=text_type).strip()

        match = re_qualifier.match(text_str)
        if match:
            qualifiers = _valid_qualifiers(match.group(1), FN_QUALIFIERS)
        else:
            qualifiers = []

        yield anchor, fn_type, qualifiers, text_str, html_str