block join search works, wrapped queries.
[wolnelektury.git] / apps/search/index.py
# -*- coding: utf-8 -*-
from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, File, Field, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, Term, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, TermsFilter, \
    HashSet, BooleanClause
import os
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from catalogue.models import Book
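# NOTE: the `lucene` module is PyLucene (JCC bindings); the classes above
# target Lucene 3.4, matching the Version.LUCENE_34 constants used below.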


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        keyword = KeywordAnalyzer(Version.LUCENE_34)
        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        self.addAnalyzer("is_book", keyword)

        # self.addAnalyzer("fragment_anchor", keyword)


class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            # ignore "directory already exists", re-raise anything else
            if exc.errno != errno.EEXIST:
                raise


class Index(IndexStore):
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def close(self):
        self.index.optimize()
        self.index.close()
        self.index = None

    def remove_book(self, book):
        # every document of a book's block (book + children) carries book_id,
        # so this range query removes the whole block
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        if overwrite:
            self.remove_book(book)

        doc = self.extract_metadata(book)
        parts = self.extract_content(book)
        block = ArrayList().of_(Document)

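        # Child (header/fragment) documents go into the block first; the
        # parent book document must come last -- that is the ordering
        # IndexWriter.addDocuments() and BlockJoinQuery rely on.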
        for p in parts:
            block.add(p)
        block.add(doc)
        self.index.addDocuments(block)

    # tags of elements that open the master (main content) part of a book
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny',
        'liryka_l',
        'liryka_lp',
        'wywiad',
        ]

    def create_book_doc(self, book):
        """
        Create a lucene document connected to the book
        """
        doc = Document()
        # the numeric book_id field makes the range query in remove_book() work
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
        book_info = dcparser.parse(book.xml_file)

        doc = self.create_book_doc(book)
        doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
        doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

        # each metadata field carries a name and a validator
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED))
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED))
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    # dates are stored as sortable YYYYMMDD strings
                    doc.add(Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED))
        return doc

    def get_master(self, root):
        # return the first element whose tag opens the master part of the book
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def extract_content(self, book):
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        # First, build a document for each top-level item of the master part:
        # book_id
        # header_index - the 0-indexed position of the header element
        # header_type - the header's tag name
        # content - all text under the header
        master = self.get_master(root)
        header_docs = []
        for position, header in enumerate(master):
            print("header %s @%d" % (header, position))
            doc = self.create_book_doc(book)
            doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
            doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
            content = u' '.join(header.itertext())
            doc.add(Field("content", content, Field.Store.NO, Field.Index.ANALYZED))
            header_docs.append(doc)

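        # walker() is a pre/post-order traversal: it yields (node, None) on
        # entering a node and (None, node) on leaving it, which lets the
        # fragment loop below collect text lying between begin/end anchors.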
        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node

        # Then we create a document for each fragment, with:
        # fragment_anchor - the anchor id
        # themes - the fragment's themes (indexed, not stored)
        # content - all text inside the fragment
        fragment_docs = []
        # maps fragment id -> {'content': [], 'themes': []}
        fragments = {}
        for start, end in walker(master):
            print("%s %s" % (start, end))
            if start is not None and start.tag == 'begin':
                fid = start.attrib['id'][1:]
                fragments[fid] = {'content': [], 'themes': []}
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'motyw':
                fid = start.attrib['id'][1:]
                fragments[fid]['themes'].append(start.text)
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'end':
                fid = start.attrib['id'][1:]
                frag = fragments[fid]
                del fragments[fid]
                print("Fragment %s complete, themes: %s contents: %s" % (fid, frag['themes'], frag['content']))

                doc = self.create_book_doc(book)
                doc.add(Field("fragment_anchor", fid, Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("content", u' '.join(filter(lambda s: s is not None, frag['content'])), Field.Store.NO, Field.Index.ANALYZED))
                doc.add(Field("themes", u' '.join(frag['themes']), Field.Store.NO, Field.Index.ANALYZED))
                fragment_docs.append(doc)
            elif start is not None:
                # entered a plain node: its text belongs to every open fragment
                for frag in fragments.values():
                    frag['content'].append(start.text)
            elif end is not None:
                # left a node: its tail text also belongs to every open fragment
                for frag in fragments.values():
                    frag['content'].append(end.tail)

        return header_docs + fragment_docs

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()


class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = PolishAnalyzer(Version.LUCENE_34)
        ## self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))
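        # this filter identifies parent (book) documents; BlockJoinQuery uses
        # it to join matching child documents up to their parent book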

    def query(self, query):
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=[]):
        """
        Recursively modifies the query so that contained term and phrase
        queries which only match the provided fields are wrapped in a
        BlockJoinQuery, i.e. delegated to the child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
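            # every term hit one of the given fields, so wrap the whole query;
            # ScoreMode.Total sums child scores into the parent hit (compare
            # bsearch() below, which uses ScoreMode.Avg)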
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)."""
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        # delegate matches on content/themes to child documents via block join
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        f = TermsFilter()
        f.addTerm(Term("is_book", "true"))
        bjq = BlockJoinQuery(q, f, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
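

# Example usage (a sketch, not part of the module): assumes the Lucene VM
# has been started elsewhere, e.g. with lucene.initVM(), and that catalogue
# Book objects with XML files exist.
#
#     with Index() as index:
#         for book in Book.objects.all():
#             index.index_book(book)
#
#     search = Search()
#     books, hits = search.search(u'miłość')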