1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 from librarian.parser import WLDocument
16 'dramat_wierszowany_l',
17 'dramat_wierszowany_lp',
18 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
22 ignore_content_tags = [
23 'uwaga', 'extra', 'nota_red', 'abstrakt',
24 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
26 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc', 'motyw'
# Tags of footnote elements; text inside them is collected separately from
# the surrounding section text.
footnote_tags = ['pa', 'pt', 'pr', 'pe']

# Direct children of the master element carrying one of these tags are
# skipped instead of being indexed as sections.
skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                    '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
def get_master(cls, root):
    """
    Return the first master element found in an etree.

    :param root: element whose ``iter()`` walks the tree to search.
    :returns: the first element whose tag is listed in
        ``cls.master_tags``, or ``None`` when no such element exists.
    """
    for master in root.iter():
        if master.tag in cls.master_tags:
            return master
    # Make the "not found" result explicit for callers that test for it.
    return None
44 def add_snippet(book, text, position):
45 book.snippet_set.create(
def index_book(cls, book):
    """
    Walk the book XML and store its text content as snippets.

    All snippets previously attached to the book are deleted first.
    Every child of the master element, except the ones listed in
    ``cls.skip_header_tags`` and XML comments, is then treated as one
    section: each non-empty footnote inside it becomes a snippet of its
    own and the remaining text becomes the section snippet.

    Nothing is done when the book has no XML file; when the document has
    no master element only the old snippets are removed.

    :param book: object providing ``xml_file`` and ``snippet_set``.
    """
    if not book.xml_file:
        return

    book.snippet_set.all().delete()

    wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
    root = wld.edoc.getroot()

    master = cls.get_master(root)
    if master is None:
        return

    def walker(node):
        """
        Yield ``(start, text, end)`` events for ``node`` and its subtree.

        Exactly one item of each triple is set: an element being entered,
        a piece of text, or an element being left.  Elements listed in
        ``cls.ignore_content_tags`` contribute only their tail text.
        """
        if node.tag not in cls.ignore_content_tags:
            yield node, None, None
            if node.text is not None:
                yield None, node.text, None
            for child in list(node):
                yield from walker(child)
            yield None, None, node

        # The tail follows the element, so it is emitted even when the
        # element itself is ignored.
        if node.tail is not None:
            yield None, node.tail, None

    def fix_format(text):
        """Join a list of text pieces and strip a "/" that ends a line."""
        if isinstance(text, list):
            # Filter the argument itself rather than a variable of the
            # enclosing scope.
            # NOTE(review): the separator is assumed to be a single
            # space -- confirm against the indexed output.
            text = ' '.join(piece for piece in text if piece is not None)
        return re.sub("(?m)/$", "", text)

    for position, header in enumerate(master):
        if header.tag in cls.skip_header_tags:
            continue
        if header.tag is etree.Comment:
            continue

        content = []
        # Stack of lists receiving text; the last one is the active
        # target.  The bottom entry is the section text, every entry
        # above it is a footnote that is currently being read.
        targets = [content]

        for start, text, end in walker(header):
            if start is not None and start.tag in cls.footnote_tags:
                targets.append([])
            elif (end is not None and end.tag in cls.footnote_tags
                    and len(targets) > 1):
                # Always pop on a footnote end so later text goes back to
                # the enclosing target; only the snippet creation depends
                # on the footnote actually having content.
                footnote = targets.pop()
                if footnote:
                    cls.add_snippet(book, ''.join(footnote), position)

            if text is not None:
                targets[-1].append(text)

        # in the end, add a section text.
        cls.add_snippet(book, fix_format(content), position)