1 # This file is part of Wolne Lektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
5 from librarian.elements.base import WLElement
6 from librarian.document import WLDocument
17 'dramat_wierszowany_l',
18 'dramat_wierszowany_lp',
19 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
23 ignore_content_tags = [
24 'uwaga', 'extra', 'nota_red', 'abstrakt',
25 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
27 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc', 'motyw'
30 footnote_tags = ['pa', 'pt', 'pr', 'pe']
32 skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
33 '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
36 def add_snippet(book, text, position, anchor):
37 book.snippet_set.create(
43 # TODO: The section links stuff won't work.
45 def index_book(cls, book):
47 Walks the book XML and extract content from it.
48 Adds parts for each header tag and for each fragment.
50 if not book.xml_file: return
52 book.snippet_set.all().delete()
54 wld = WLDocument(filename=book.xml_file.path)
57 master = wld.tree.getroot().master
61 def get_indexable(element):
63 if not isinstance(child, WLElement):
65 if not child.attrib.get('_id'):
66 for e in get_indexable(child):
72 if node.tag not in cls.ignore_content_tags:
73 yield node, None, None
74 if node.text is not None:
75 yield None, node.text, None
76 for child in list(node):
77 for b, t, e in walker(child):
79 yield None, None, node
81 if node.tail is not None:
82 yield None, node.tail, None
86 if isinstance(text, list):
87 text = filter(lambda s: s is not None, content)
90 return re.sub("(?m)/$", "", text)
92 for position, header in enumerate(get_indexable(master)):
93 if header.tag in cls.skip_header_tags:
95 if header.tag is etree.Comment:
98 el_id = header.attrib['_id']
104 def all_content(text):
106 handle_text = [all_content]
108 for start, text, end in walker(header):
110 if start is not None and start.tag in cls.footnote_tags:
113 def collect_footnote(t):
116 handle_text.append(collect_footnote)
117 elif end is not None and footnote is not [] and end.tag in cls.footnote_tags:
119 cls.add_snippet(book, ''.join(footnote), position, el_id)
122 if text is not None and handle_text is not []:
123 hdl = handle_text[-1]
126 # in the end, add a section text.
127 cls.add_snippet(book, fix_format(content), position, el_id)