X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/18aa8ca52202003e5628a882f3469a04d905cc05..86530a9e72f32d28ef1971ac9fa705c85b1bd3b6:/src/search/index.py?ds=sidebyside diff --git a/src/search/index.py b/src/search/index.py index e0a727ca2..3b0edebf8 100644 --- a/src/search/index.py +++ b/src/search/index.py @@ -2,7 +2,8 @@ # Copyright © Fundacja Wolne Lektury. See NOTICE for more information. # import re -from librarian.parser import WLDocument +from librarian.elements.base import WLElement +from librarian.document import WLDocument from lxml import etree @@ -31,22 +32,15 @@ class Index: skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF'] - @classmethod - def get_master(cls, root): - """ - Returns the first master tag from an etree. - """ - for master in root.iter(): - if master.tag in cls.master_tags: - return master - @staticmethod - def add_snippet(book, text, position): + def add_snippet(book, text, position, anchor): book.snippet_set.create( sec=position + 1, - text=text + text=text, + anchor=anchor ) + # TODO: The section links stuff won't work. @classmethod def index_book(cls, book): """ @@ -57,13 +51,23 @@ class Index: book.snippet_set.all().delete() - wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False) - root = wld.edoc.getroot() + wld = WLDocument(filename=book.xml_file.path) + wld.assign_ids() - master = cls.get_master(root) + master = wld.tree.getroot().master if master is None: return [] + def get_indexable(element): + for child in element: + if not isinstance(child, WLElement): + continue + if not child.attrib.get('_id'): + for e in get_indexable(child): + yield e + else: + yield child + def walker(node): if node.tag not in cls.ignore_content_tags: yield node, None, None @@ -85,12 +89,14 @@ class Index: return re.sub("(?m)/$", "", text) - for position, header in enumerate(master): + for position, header in enumerate(get_indexable(master)): if header.tag in cls.skip_header_tags: continue if header.tag is etree.Comment: continue + el_id = header.attrib['_id'] + # section content content = [] footnote = [] @@ -110,7 +116,7 @@ class Index: handle_text.append(collect_footnote) elif end is not None and footnote is not [] and end.tag in cls.footnote_tags: handle_text.pop() - cls.add_snippet(book, ''.join(footnote), position) + cls.add_snippet(book, ''.join(footnote), position, el_id) footnote = [] if text is not None and handle_text is not []: @@ -118,4 +124,4 @@ class Index: hdl(text) # in the end, add a section text. - cls.add_snippet(book, fix_format(content), position) + cls.add_snippet(book, fix_format(content), position, el_id)