# -*- coding: utf-8 -*-
from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, File, Field, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, Term, TermQuery, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, TermsFilter, \
    HashSet, BooleanClause, CharTermAttribute, \
    PhraseQuery, StringReader
    # KeywordAnalyzer
import os
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
import atexit



class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        keyword = KeywordAnalyzer(Version.LUCENE_34)
        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        self.addAnalyzer("is_book", keyword)

        # self.addAnalyzer("fragment_anchor", keyword)


class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise

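# Note: settings.SEARCH_INDEX is assumed to point at a writable directory
# holding the Lucene index. A minimal sketch of such a (hypothetical) setting:
#
#     SEARCH_INDEX = os.path.join(PROJECT_DIR, 'search_index')
#
# The exact path is project-specific; all IndexStore requires is that
# os.makedirs() can create it and SimpleFSDirectory can open it.
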
class Index(IndexStore):
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def close(self):
        self.index.optimize()
        self.index.close()
        self.index = None

    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        if overwrite:
            self.remove_book(book)

        doc = self.extract_metadata(book)
        parts = self.extract_content(book)
        block = ArrayList().of_(Document)

        # For block-join searching, the child documents (content parts) must be
        # added first and the parent (book) document last, all in a single
        # addDocuments call, so they form one contiguous block in the index.
        for p in parts:
            block.add(p)
        block.add(doc)
        self.index.addDocuments(block)

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu']

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the given book.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
        book_info = dcparser.parse(book.xml_file)

        doc = self.create_book_doc(book)
        doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
        doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

        # map book-info fields to index fields, using each field's validator as a type hint
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED))
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED))
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    doc.add(Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED))
        return doc

    def get_master(self, root):
        """Return the first master element found in the document tree."""
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def extract_content(self, book):
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        # First we build a sequence of top-level items, each carrying:
        # - book_id
        # - header_index - the 0-indexed position of the header element
        # - content
        master = self.get_master(root)
        if master is None:
            return []

        header_docs = []
        for position, header in enumerate(master):
            if header.tag in self.skip_header_tags:
                continue
            doc = self.create_book_doc(book)
            doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
            doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
            content = u' '.join(header.itertext())
            doc.add(Field("content", content, Field.Store.NO, Field.Index.ANALYZED))
            header_docs.append(doc)

        def walker(node):
            # depth-first traversal: yields (node, None) on entering a node
            # and (None, node) on leaving it
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        # Then we create a document for each fragment, with:
        # fragment_anchor - the anchor
        # themes - list of themes [not indexed]
        fragment_docs = []
        # will contain fragment id -> {'content': [], 'themes': []}
        fragments = {}
        for start, end in walker(master):
            if start is not None and start.tag == 'begin':
                fid = start.attrib['id'][1:]
                fragments[fid] = {'content': [], 'themes': []}
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'motyw':
                fid = start.attrib['id'][1:]
                fragments[fid]['themes'].append(start.text)
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'end':
                fid = start.attrib['id'][1:]
                if fid not in fragments:
                    continue  # a broken <end> node, skip it
                frag = fragments[fid]
                del fragments[fid]

                def jstr(l):
                    return u' '.join(map(
                        lambda x: u'(none)' if x is None else unicode(x),
                        l))

                doc = self.create_book_doc(book)
                doc.add(Field("fragment_anchor", fid,
                              Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("content",
                              u' '.join(filter(lambda s: s is not None, frag['content'])),
                              Field.Store.NO, Field.Index.ANALYZED))
                doc.add(Field("themes",
                              u' '.join(filter(lambda s: s is not None, frag['themes'])),
                              Field.Store.NO, Field.Index.ANALYZED))

                fragment_docs.append(doc)
            elif start is not None:
                for frag in fragments.values():
                    frag['content'].append(start.text)
            elif end is not None:
                for frag in fragments.values():
                    frag['content'].append(end.tail)

        return header_docs + fragment_docs

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()

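# A minimal usage sketch (not part of the module): indexing one book with the
# context-manager interface above. ``book`` is assumed to be a
# catalogue.models.Book instance with an attached XML file.
#
#     with Index() as index:
#         index.index_book(book)
#
# Opening and closing (which runs optimize()) per book is expensive; for bulk
# imports ReusableIndex below keeps a single writer open until program exit.
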
class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass

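# A minimal usage sketch (not part of the module), assuming ``books`` is an
# iterable of catalogue.models.Book objects: the same writer is shared across
# iterations and only optimized/closed once.
#
#     for book in books:
#         with ReusableIndex() as index:
#             index.index_book(book)
#     # if the atexit hook cannot be relied on (e.g. a long-running worker):
#     ReusableIndex.close_reusable()

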
class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = PolishAnalyzer(Version.LUCENE_34)
        ## self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=None):
        """
        Recursively rewrite the query so that contained Term and Phrase
        queries matching the given fields are wrapped in a BlockJoinQuery
        and thereby delegated to the child documents.
        """
        if fields is None:
            fields = []
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)."""
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

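# A minimal usage sketch (not part of the module) of the three entry points,
# assuming the index has already been built; the query string is an arbitrary
# example:
#
#     s = Search()
#     books, hits = s.simple_search(u"lokomotywa")    # flat query, no joins
#     books, hits = s.search(u"lokomotywa")           # joins content/themes to parent books
#     books, hits = s.bsearch(u"lokomotywa")          # block-join with averaged scores
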
# Reference (Java) for reading tokens from an analyzer's TokenStream,
# mirrored by MultiSearch.get_tokens() below:
#
# TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
# OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
# CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
#
# while (tokenStream.incrementToken()) {
#     int startOffset = offsetAttribute.startOffset();
#     int endOffset = offsetAttribute.endOffset();
#     String term = charTermAttribute.toString();
# }


class MultiSearch(Search):
    """Class capable of IMDb-like searching"""
    def get_tokens(self, queryreader):
        if isinstance(queryreader, (str, unicode)):
            queryreader = StringReader(queryreader)
        queryreader.reset()
        tokens = self.analyzer.reusableTokenStream('content', queryreader)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            # copy the term text out: the attribute object is reused for every
            # token, so storing it directly would just repeat the last term
            toks.append(cta.toString())
        return toks

    def make_phrase(self, tokens, field='content', joined=False):
        phrase = PhraseQuery()
        for t in tokens:
            term = Term(field, t)
            phrase.add(term)
        if joined:
            phrase = self.content_query(phrase)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, joined=False):
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            # BooleanClause wraps a Query, so the Term must go through TermQuery first
            q.add(BooleanClause(TermQuery(term), modal))
        if joined:
            q = self.content_query(q)
        return q

    def content_query(self, query):
        return BlockJoinQuery(query, self.parent_filter,
                              BlockJoinQuery.ScoreMode.Total)

    def multiseach(self, query, max_results=50):
        """
        Search strategy:
        - (phrase) OR -> content
                      -> title
                      -> author
        - (keywords)  -> author
                      -> motyw
                      -> tags
                      -> content
        """
        queryreader = StringReader(query)
        tokens = self.get_tokens(queryreader)

        top_level = BooleanQuery()
        Should = BooleanClause.Occur.SHOULD

        phrase_level = BooleanQuery()

        p_content = self.make_phrase(tokens, joined=True)
        p_title = self.make_phrase(tokens, 'title')
        p_author = self.make_phrase(tokens, 'author')

        phrase_level.add(BooleanClause(p_content, Should))
        phrase_level.add(BooleanClause(p_title, Should))
        phrase_level.add(BooleanClause(p_author, Should))

        kw_level = BooleanQuery()

        kw_level.add(self.make_term_query(tokens, 'author'), Should)
        kw_level.add(self.make_term_query(tokens, 'themes', joined=True), Should)
        kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        kw_level.add(self.make_term_query(tokens, joined=True), Should)

        top_level.add(BooleanClause(phrase_level, Should))
        top_level.add(BooleanClause(kw_level, Should))

        tops = self.searcher.search(top_level, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
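

# A minimal usage sketch (not part of the module) for the combined strategy
# above; the query string is an arbitrary example:
#
#     ms = MultiSearch()
#     books, hits = ms.multiseach(u"pan tadeusz")
#     for book in books:
#         print book.title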