problems with snippets---outofmemoryexception
wolnelektury.git / apps/search/index.py
# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, TermsFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, Sort, Integer, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    initVM, CLASSPATH, JArray
    # KeywordAnalyzer

# Initialize jvm
JVM = initVM(classpath=CLASSPATH, maxheap=str(400*1024*1024))

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback

class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        keyword = KeywordAnalyzer(Version.LUCENE_34)
        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        self.addAnalyzer("is_book", keyword)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    Stores plain-text snippets for one book in a single file, so that search
    results can show an excerpt without reparsing the book XML.
    add() returns a (position, length) pair (in bytes) that get() uses later.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        txt = snippet.encode('utf-8')
        l = len(txt)  # length in bytes, since offsets into the file are byte offsets
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        self.file.close()


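# A rough usage sketch for Snippets (kept as a comment; the book id 123 below is
# just an illustrative value). add() returns a (position, length) pair which
# get() later uses to read the stored snippet text back:
#
#   snips = Snippets(123).open('w')
#   try:
#       pos = snips.add(u"Some fragment text to store...")
#   finally:
#       snips.close()
#
#   snips = Snippets(123).open()
#   try:
#       print snips.get(pos)
#   finally:
#       snips.close()
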
class Index(IndexStore):
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError, je:
            print "Error during optimize phase, check index: %s" % je

        self.index.close()
        self.index = None

    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        if overwrite:
            self.remove_book(book)

        doc = self.extract_metadata(book)
        parts = self.extract_content(book)
        block = ArrayList().of_(Document)

        print "adding block."
        for p in parts:
            block.add(p)
        block.add(doc)
        self.index.addDocuments(block)
        print "added."

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def create_book_doc(self, book):
        """
        Create a lucene document connected to the book
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
        book_info = dcparser.parse(book.xml_file)

        print("extract metadata for book %s id=%d, thread %d" % (book.slug, book.id, current_thread().ident))

        doc = self.create_book_doc(book)
        doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
        doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED))
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED))
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    doc.add(Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED))
        return doc

    def get_master(self, root):
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def extract_content(self, book):
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        # yields (start, end) events for a depth-first walk of the tree:
        # (node, None) when a node is entered, (None, node) when it is left.
        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("/$", "", text, flags=re.M)

        # One document per section:
        # header_type
        # header_index
        header_docs = []
        # Then we create a document for each fragment:
        # fragment_anchor - the anchor
        # themes - list of themes [not indexed]
        fragment_docs = []
        # will contain: fragment id -> { content: [], themes: [] }
        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):
                sys.stdout.write("\rsection: %d" % position)

                if header.tag in self.skip_header_tags:
                    continue

                doc = self.create_book_doc(book)

                doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
                doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))

                content = u' '.join([t for t in header.itertext()])
                content = fix_format(content)

                doc.add(Field("content", content, Field.Store.NO, Field.Index.ANALYZED))
                snip_pos = snippets.add(content)
                doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
                doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

                header_docs.append(doc)

                for start, end in walker(master):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        fragments[fid]['themes'].append(start.text)
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: x is None and u'(none)' or unicode(x),
                                l))

                        doc = self.create_book_doc(book)

                        doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
                        doc.add(NumericField("header_span", Field.Store.YES, True).setIntValue(position - frag['start_section'] + 1))
                        doc.add(Field("header_type", frag['start_header'], Field.Store.YES, Field.Index.NOT_ANALYZED))

                        doc.add(Field("fragment_anchor", fid,
                                      Field.Store.YES, Field.Index.NOT_ANALYZED))
                        doc.add(Field("content",
                                      u' '.join(filter(lambda s: s is not None, frag['content'])),
                                      Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))

                        snip_pos = snippets.add(content)
                        doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
                        doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

                        doc.add(Field("themes",
                                      u' '.join(filter(lambda s: s is not None, frag['themes'])),
                                      Field.Store.NO, Field.Index.ANALYZED))

                        fragment_docs.append(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()

        return header_docs + fragment_docs

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()


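# A rough sketch of how Index is meant to be used (kept as a comment; 'some-slug'
# is a made-up example value). The context manager opens the IndexWriter,
# index_book() adds the metadata document plus the per-section and per-fragment
# documents as one block, and close() optimizes and closes the writer:
#
#   book = catalogue.models.Book.objects.get(slug='some-slug')
#   with Index() as index:
#       index.index_book(book)
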
def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception, e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None
    pool = None
    pool_jobs = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread())
            ReusableIndex.pool_jobs = []
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    def index_book(self, *args, **kw):
        job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
        ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            print("wait for indexing to finish")
            for job in ReusableIndex.pool_jobs:
                job.get()
                sys.stdout.write('.')
                sys.stdout.flush()
            print("done.")
            ReusableIndex.pool.close()

            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass


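# A rough usage sketch for ReusableIndex (kept as a comment). Books are queued on
# the thread pool, and the shared IndexWriter is only optimized and closed by the
# atexit hook (or by an explicit close_reusable()):
#
#   index = ReusableIndex()
#   index.open()
#   for book in catalogue.models.Book.objects.all():
#       index.index_book(book)
#   # ...later, at process exit, or explicitly:
#   ReusableIndex.close_reusable()
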
class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query recursively: contained Term and
        Phrase queries which match the provided fields are wrapped in a
        BlockJoinQuery and so delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

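# A rough usage sketch for Search (kept as a comment; the query string below is
# only an example value). simple_search() parses the query with the Lucene
# QueryParser and returns matching Book objects with the total hit count:
#
#   search = Search()
#   books, total = search.simple_search(u"pan tadeusz")
#   for book in books:
#       print book
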
# TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
# OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
# CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);

# while (tokenStream.incrementToken()) {
#     int startOffset = offsetAttribute.startOffset();
#     int endOffset = offsetAttribute.endOffset();
#     String term = charTermAttribute.toString();
# }


class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets_cb=None):
        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self.hits = []
        self.snippets = []  # filled in by add_snippets()

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = header_span is not None and int(header_span) or 1

        fragment = stored.get("fragment_anchor")

        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets_cb': snippets_cb})

        self.hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.hits += other.hits
        if other.score > self.score:
            self.score = other.score
        return self

    def add_snippets(self, snippets):
        self.snippets += snippets
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def get_parts(self):
        book = self.book

        def sections_covered(results):
            frags = filter(lambda r: r[1] is not None, results)
            sect = filter(lambda r: r[1] is None, results)
            sect = filter(lambda s: 0 == len(filter(
                lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
                frags)), sect)
            print "filtered, non overlapped sections: %s" % sect
            return frags + sect

        # each hit is ((header_type, header_index, header_span), fragment_anchor, score, other);
        # rebuild section/fragment lists and their scores from the collected hits.
        sections = [hit[0] for hit in self.hits if hit[1] is None]
        fragments = [hit[1] for hit in self.hits if hit[1] is not None]
        scores = dict([(hit[1] or hit[0], hit[2]) for hit in self.hits])

        parts = [{"header": s[0], "position": s[1], '_score_key': s} for s in sections] \
            + [{"fragment": book.fragments.get(anchor=f), '_score_key': f} for f in fragments]

        parts.sort(lambda a, b: cmp(scores[a['_score_key']], scores[b['_score_key']]))
        print("bookid: %d parts: %s" % (self.book_id, parts))
        return parts

    parts = property(get_parts)

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)


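# A rough sketch of how SearchResult objects are combined (kept as a comment;
# multi_search stands for a MultiSearch instance, defined below, and the query is
# only an example value). aggregate() merges results so that each book appears
# once, with its best score; sorting works thanks to __cmp__ above:
#
#   multi_search = MultiSearch()
#   perfect = multi_search.search_perfect_book(u"pan tadeusz")
#   everywhere = multi_search.search_everywhere(u"pan tadeusz")
#   for result in sorted(SearchResult.aggregate(perfect, everywhere), reverse=True):
#       print result.book_id, result.score
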
class MultiSearch(Search):
    """Class capable of IMDb-like searching"""
    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be a StringReader, a string/unicode, or a list of tokens.
        In the last case the tokens are just returned (so tokens can be reused
        if the analyzer does not change).
        """
        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def content_query(self, query):
        return BlockJoinQuery(query, self.parent_filter,
                              BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False):
        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in ['author', 'title']]

        books = []
        for q in qrys:
            top = self.searcher.search(q, max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False):
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        books = []
        for q in qrys:
            top = self.searcher.search(q, max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found).add_snippets(self.get_snippets(found, q)))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False):
        books = []

        # content only query : themes x content
        q = BooleanQuery()

        tokens = self.get_tokens(searched)
        q.add(BooleanClause(self.make_term_query(tokens, field='themes', fuzzy=fuzzy), BooleanClause.Occur.MUST))
        q.add(BooleanClause(self.make_term_query(tokens, field='content', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        # joined query themes/content x author/title/epochs/genres/kinds
        q = BooleanQuery()
        in_meta = BooleanQuery()
        in_content = BooleanQuery()

        for fld in ['themes', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        in_meta.add(BooleanClause(self.make_term_query(
            self.get_tokens(searched, field='author'), field='author', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        for fld in ['title', 'epochs', 'genres', 'kinds']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_meta, BooleanClause.Occur.MUST))
        in_content_join = self.content_query(in_content)
        q.add(BooleanClause(in_content_join, BooleanClause.Occur.MUST))
        # import pdb; pdb.set_trace()
        collector = BlockJoinCollector(Sort.RELEVANCE, 100, True, True)

        self.searcher.search(q, collector)

        top_groups = collector.getTopGroups(in_content_join, Sort.RELEVANCE, 0, max_results, 0, True)
        if top_groups:
            for grp in top_groups.groups:
                for part in grp.scoreDocs:
                    books.append(SearchResult(self.searcher, part, score=grp.maxScore))
        return books

    def multisearch(self, query, max_results=50):
        """
        Search strategy:
        - (phrase) OR -> content
                      -> title
                      -> author
        - (keywords)  -> author
                      -> motyw
                      -> tags
                      -> content
        """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        return None

    def do_search(self, query, max_results=50, collector=None):
        tops = self.searcher.search(query, max_results)
        # tops = self.searcher.search(p_content, max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
            bks.append(b)
            print "%s (%d) -> %f" % (b, b.id, found.score)
        return (bks, tops.totalHits)

    def get_snippets(self, scoreDoc, query, field='content'):
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate content: the stored snippet offsets are strings, and
        # Snippets.get() expects a single (position, length) pair.
        position = int(stored.get('snippets_position'))
        length = int(stored.get('snippets_length'))
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((position, length))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
        print('snips: %s' % snip)

        return [snip]
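
# A rough end-to-end sketch (kept as a comment; the query below is only an example
# value). search_perfect_parts() builds phrase queries over the "content" field,
# wraps every hit in a SearchResult and attaches highlighted snippets read back
# from the per-book snippet files by get_snippets():
#
#   search = MultiSearch()
#   for result in search.search_perfect_parts(u"pan tadeusz"):
#       print result.book_id, result.snippets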