Merge branch 'pdf'
[librarian.git] / librarian / html.py
index 5f832e3..5974d93 100644 (file)
@@ -263,3 +263,17 @@ def add_table_of_contents(root):
 
     root.insert(0, toc)
 
+
+def extract_annotations(html_path):
+    """Yield an (anchor, text, html) tuple for every annotation in a file.
+
+    The document at `html_path` is parsed as UTF-8 HTML.  Annotations are
+    the `div` children of the element whose id is "footnotes"; when no
+    such element exists, nothing is yielded.
+
+    For each annotation:
+      * anchor -- the `name` attribute of its first `a` child that has one,
+      * text -- the annotation element, minus its first two children,
+        serialized as UTF-8 text and stripped of surrounding whitespace,
+      * html -- the same trimmed element serialized as UTF-8 HTML.
+    """
+    document = etree.parse(html_path, etree.HTMLParser(encoding='utf-8'))
+    container = document.find('//*[@id="footnotes"]')
+    if container is None:
+        return
+
+    for note in container.findall('div'):
+        named_link = note.find('a[@name]')
+        anchor = named_link.get('name')
+        # Remove the first two child elements so they are left out of both
+        # serializations below (presumably the anchor and back-reference
+        # links -- confirm against the generated markup).
+        del note[:2]
+        as_text = etree.tostring(note, method='text', encoding='utf-8')
+        as_html = etree.tostring(note, method='html', encoding='utf-8')
+        yield anchor, as_text.strip(), as_html
+