- for footnote in tree.find('//*[@id="footnotes"]').findall('div'):
- anchor = footnote.find('a[@href]').get('href')
- del footnote[:2]
- text_str = etree.tostring(footnote, method='text', encoding='utf-8')
- html_str = etree.tostring(footnote, method='html', encoding='utf-8')
- yield anchor, text_str, html_str
+ footnotes = tree.find('//*[@id="footnotes"]')
+ re_qualifier = re.compile(ur'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
+ if footnotes is not None:
+ for footnote in footnotes.findall('div'):
+ fn_type = footnote.get('class').split('-')[1]
+ anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
+ del footnote[:2]
+ footnote.text = None
+ if len(footnote) and footnote[-1].tail == '\n':
+ footnote[-1].tail = None
+ text_str = etree.tostring(footnote, method='text', encoding=unicode).strip()
+ html_str = etree.tostring(footnote, method='html', encoding=unicode).strip()
+
+ match = re_qualifier.match(text_str)
+ if match:
+ qualifier_str = match.group(1)
+ qualifiers = []
+ for candidate in re.split('[;,]', qualifier_str):
+ candidate = candidate.strip()
+ if candidate in FN_QUALIFIERS:
+ qualifiers.append(candidate)
+ elif candidate.startswith('z '):
+ subcandidate = candidate.split()[1]
+ if subcandidate in FN_QUALIFIERS:
+ qualifiers.append(subcandidate)
+ else:
+ qualifiers = []
+
+ yield anchor, fn_type, qualifiers, text_str, html_str