Minor fixups.
[librarian.git] / librarian / html.py
index 8703150..6115b31 100644 (file)
@@ -4,6 +4,7 @@
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 import os
+import re
 import cStringIO
 import copy
 
@@ -134,14 +135,17 @@ def extract_fragments(input_filename):
                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 
                 # Append parents
-                if element.getparent().get('id', None) != 'book-text':
-                    parents = [element.getparent()]
-                    while parents[-1].getparent().get('id', None) != 'book-text':
-                        parents.append(parents[-1].getparent())
-
-                    parents.reverse()
-                    for parent in parents:
-                        fragment.append('start', parent)
+                parent = element.getparent()
+                parents = []
+                while parent.get('id', None) != 'book-text':
+                    cparent = copy.deepcopy(parent)
+                    cparent.text = None
+                    parents.append(cparent)
+                    parent = parent.getparent()
+
+                parents.reverse()
+                for parent in parents:
+                    fragment.append('start', parent)
 
                 open_fragments[fragment.id] = fragment
 
@@ -222,7 +226,7 @@ def add_anchors(root):
 def raw_printable_text(element):
     working = copy.deepcopy(element)
     for e in working.findall('a'):
-        if e.get('class') == 'annotation':
+        if e.get('class') in ('annotation', 'theme-begin'):  # blank these links' text
             e.text = ''
     return etree.tostring(working, method='text', encoding=unicode).strip()
 
@@ -289,17 +293,44 @@ def add_table_of_themes(root):
     root.insert(0, themes_div)
 
 
-
 def extract_annotations(html_path):
-    """For each annotation, yields a tuple: anchor, text, html."""
+    """Extract footnotes from an HTML file for the annotations dictionary.
+
+    Yields (anchor, footnote type, qualifiers, text, html) for each child
+    <div> of the id="footnotes" element; nothing if that element is missing.
+    qualifiers: FN_QUALIFIERS entries named in "(...)" before the em dash.
+    """
+    from .fn_qualifiers import FN_QUALIFIERS
+
     parser = etree.HTMLParser(encoding='utf-8')
     tree = etree.parse(html_path, parser)
     footnotes = tree.find('//*[@id="footnotes"]')
+    re_qualifier = re.compile(ur'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
     if footnotes is not None:
         for footnote in footnotes.findall('div'):
-            anchor = footnote.find('a[@name]').get('name')
+            fn_type = footnote.get('class').split('-')[1]  # 2nd "-"-separated part
+            anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
             del footnote[:2]
-            text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip()
-            html_str = etree.tostring(footnote, method='html', encoding='utf-8')
-            yield anchor, text_str, html_str
+            footnote.text = None  # drop text before the first remaining child
+            if len(footnote) and footnote[-1].tail == '\n':
+                footnote[-1].tail = None  # drop a lone trailing newline
+            text_str = etree.tostring(footnote, method='text', encoding=unicode).strip()
+            html_str = etree.tostring(footnote, method='html', encoding=unicode).strip()
+
+            match = re_qualifier.match(text_str)
+            if match:
+                qualifier_str = match.group(1)
+                qualifiers = []
+                for candidate in re.split('[;,]', qualifier_str):  # ";" or "," separated
+                    candidate = candidate.strip()
+                    if candidate in FN_QUALIFIERS:
+                        qualifiers.append(candidate)
+                    elif candidate.startswith('z '):  # "z <word>": test <word> instead
+                        subcandidate = candidate.split()[1]
+                        if subcandidate in FN_QUALIFIERS:
+                            qualifiers.append(subcandidate)
+            else:
+                qualifiers = []
+
+            yield anchor, fn_type, qualifiers, text_str, html_str