Merge branch 'pdf'
[librarian.git] / librarian / html.py
index 1c141f3..5974d93 100644 (file)
@@ -268,10 +268,12 @@ def extract_annotations(html_path):
     """For each annotation, yields a tuple: anchor, text, html."""
     parser = etree.HTMLParser(encoding='utf-8')
     tree = etree.parse(html_path, parser)
-    for footnote in tree.find('//*[@id="footnotes"]').findall('div'):
-        anchor = footnote.find('a[@href]').get('href')
-        del footnote[:2]
-        text_str = etree.tostring(footnote, method='text', encoding='utf-8')
-        html_str = etree.tostring(footnote, method='html', encoding='utf-8')
-        yield anchor, text_str, html_str
+    footnotes = tree.find('//*[@id="footnotes"]')
+    if footnotes is not None:
+        for footnote in footnotes.findall('div'):
+            anchor = footnote.find('a[@name]').get('name')
+            del footnote[:2]
+            text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
+            html_str = etree.tostring(footnote, method='html', encoding='utf-8')
+            yield anchor, text_str, html_str