X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/2c73c162844627d26991915fecc7e39f02bc34b8..477d10c7febb6f2225eb9df94d0729bbf230cd15:/librarian/html.py diff --git a/librarian/html.py b/librarian/html.py index e084ed2..6115b31 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -294,11 +294,18 @@ def add_table_of_themes(root): def extract_annotations(html_path): - """For each annotation, yields a tuple: anchor, text, html.""" + """Extracts annotations from HTML for annotations dictionary. + + For each annotation, yields a tuple of: + anchor, footnote type, valid qualifiers, text, html. + + """ + from .fn_qualifiers import FN_QUALIFIERS + parser = etree.HTMLParser(encoding='utf-8') tree = etree.parse(html_path, parser) footnotes = tree.find('//*[@id="footnotes"]') - re_qualifier = re.compile(ur'[^\u2014]+\s+\((.+)\)\s+\u2014') + re_qualifier = re.compile(ur'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014') if footnotes is not None: for footnote in footnotes.findall('div'): fn_type = footnote.get('class').split('-')[1] @@ -309,10 +316,21 @@ def extract_annotations(html_path): footnote[-1].tail = None text_str = etree.tostring(footnote, method='text', encoding=unicode).strip() html_str = etree.tostring(footnote, method='html', encoding=unicode).strip() - qualifier = None + match = re_qualifier.match(text_str) if match: - qualifier = match.group(1) + qualifier_str = match.group(1) + qualifiers = [] + for candidate in re.split('[;,]', qualifier_str): + candidate = candidate.strip() + if candidate in FN_QUALIFIERS: + qualifiers.append(candidate) + elif candidate.startswith('z '): + subcandidate = candidate.split()[1] + if subcandidate in FN_QUALIFIERS: + qualifiers.append(subcandidate) + else: + qualifiers = [] - yield anchor, fn_type, qualifier, text_str, html_str + yield anchor, fn_type, qualifiers, text_str, html_str