X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/bca485bab3f6b59cdee68d3c3c3bb006c0006e97..6511b439dadc8201e81ae932f8a015daf7a81fb8:/librarian/html.py diff --git a/librarian/html.py b/librarian/html.py index 8703150..0eeb76b 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -134,14 +134,17 @@ def extract_fragments(input_filename): fragment = Fragment(id=element.get('fid'), themes=element.text) # Append parents - if element.getparent().get('id', None) != 'book-text': - parents = [element.getparent()] - while parents[-1].getparent().get('id', None) != 'book-text': - parents.append(parents[-1].getparent()) - - parents.reverse() - for parent in parents: - fragment.append('start', parent) + parent = element.getparent() + parents = [] + while parent.get('id', None) != 'book-text': + cparent = copy.deepcopy(parent) + cparent.text = None + parents.append(cparent) + parent = parent.getparent() + + parents.reverse() + for parent in parents: + fragment.append('start', parent) open_fragments[fragment.id] = fragment @@ -222,7 +225,7 @@ def add_anchors(root): def raw_printable_text(element): working = copy.deepcopy(element) for e in working.findall('a'): - if e.get('class') == 'annotation': + if e.get('class') in ('annotation', 'theme-begin'): e.text = '' return etree.tostring(working, method='text', encoding=unicode).strip()