# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, TermsFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize jvm
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from catalogue import models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
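    """
    Per-field analyzer wrapper used by the whole index: PolishAnalyzer is the
    default, while selected fields (tags, names, URLs...) use SimpleAnalyzer
    or KeywordAnalyzer instead.
    """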
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        polish_gap = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        simple_gap = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple_gap)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        self.addAnalyzer("is_book", keyword)

        self.addAnalyzer("themes", simple_gap)
        self.addAnalyzer("themes_pl", polish_gap)

        self.addAnalyzer("tag_name", simple_gap)
        self.addAnalyzer("tag_name_pl", polish_gap)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
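    """
    Handles snippet storage for indexed documents: the raw text of every
    indexed part of a book is appended to one flat file per book, and each
    Lucene document only stores the (byte offset, byte length) pair needed
    to read its snippet back.
    """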
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        # track positions in UTF-8 bytes, so that get() can seek/read correctly
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        self.file.close()


class Index(IndexStore):
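    """
    Writes book documents (metadata, content parts and fragments) and tag
    documents to the Lucene index; wraps an IndexWriter opened on the shared
    IndexStore directory.
    """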
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def index_tags(self):
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book)
        for f in meta_fields.values():
            if isinstance(f, list):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['author']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def create_book_doc(self, book):
        """
        Create a Lucene document connected to the book.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
        fields = {}
        book_info = dcparser.parse(book.xml_file)

        print("extract metadata for book %s id=%d, thread %d" % (book.slug, book.id, current_thread().ident))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = [Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags]
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
        return fields

    def get_master(self, root):
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

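        # walker() does a pre/post-order walk of the XML tree: it yields
        # (node, None) on entering a node and (None, node) on leaving it,
        # which lets the loop below track where fragments begin and end.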
        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("/$", "", text, flags=re.M)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span', 1)))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                for theme in fields['themes']:
                    doc.add(Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    doc.add(Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))

            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue

                content = u' '.join(header.itertext())
                content = fix_format(content)

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)

                # walk the current header's subtree, tracking fragment markers
                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        fragments[fid]['themes'].append(start.text)
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: u'(none)' if x is None else unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()


    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()

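# A minimal usage sketch (assuming a catalogue.models.Book instance `book`):
#
#     with Index() as index:
#         index.index_book(book)
#         index.index_tags()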

def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None
    pool = None
    pool_jobs = None

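    # Usage sketch (an assumption based on the docstring above, not original code):
    #
    #     idx = ReusableIndex()
    #     idx.open(threads=4)
    #     for book in books:
    #         idx.index_book(book)   # queued on the thread pool
    #     # ReusableIndex.close_reusable() runs via atexit (or call it directly)
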
    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread())
            ReusableIndex.pool_jobs = []
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    def index_book(self, *args, **kw):
        job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
        ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            print("wait for indexing to finish")
            for job in ReusableIndex.pool_jobs:
                job.get()
                sys.stdout.write('.')
                sys.stdout.flush()
            print("done.")
            ReusableIndex.pool.close()

            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass


class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query recursively: Term and Phrase
        queries that match only the provided fields are wrapped in a
        BlockJoinQuery and so delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)."""
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

# TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
# OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
# CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);

# while (tokenStream.incrementToken()) {
#     int startOffset = offsetAttribute.startOffset();
#     int endOffset = offsetAttribute.endOffset();
#     String term = charTermAttribute.toString();
# }


class SearchResult(object):
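    """
    A single book hit: keeps the book id, the score and a list of matching
    parts (section/fragment hits) that can later be merged with other results
    for the same book.
    """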
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets_cb=None):
        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self.hits = []
        self.snippets = []

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = int(header_span) if header_span is not None else 1

        fragment = stored.get("fragment_anchor")

        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets_cb': snippets_cb})

        self.hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.hits += other.hits
        if other.score > self.score:
            self.score = other.score
        return self

    def add_snippets(self, snippets):
        self.snippets += snippets
        return self

    def get_book(self):
        return models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def get_parts(self):
        book = self.book

        def sections_covered(results):
            frags = filter(lambda r: r[1] is not None, results)
            sect = filter(lambda r: r[1] is None, results)
            sect = filter(lambda s: 0 == len(filter(
                lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
                frags)), sect)
            print("filtered, non-overlapping sections: %s" % sect)
            return frags + sect

        parts = [{"header": s[0], "position": s[1], '_score_key': s} for s in self.sections] \
            + [{"fragment": book.fragments.get(anchor=f), '_score_key': f} for f in self.fragments]

        parts.sort(lambda a, b: cmp(self.scores[a['_score_key']], self.scores[b['_score_key']]))
        print("bookid: %d parts: %s" % (self.book_id, parts))
        return parts

    parts = property(get_parts)


    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)


class MultiSearch(Search):
    """Class capable of IMDb-like searching"""
    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed with the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode object, or a list of
        tokens. In the last case the tokens are returned unchanged (so they can be
        reused as long as the analyzer does not change).
        """
        if isinstance(searched, (str, unicode)):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
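        """Builds a PhraseQuery from the tokens; with fuzzy enabled it builds a
        MultiPhraseQuery instead, expanding each token into the similar terms
        found by FuzzyTermEnum.
        """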
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def content_query(self, query):
        return BlockJoinQuery(query, self.parent_filter,
                              BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False):
        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in ['author', 'title']]

        books = []
        for q in qrys:
            top = self.searcher.search(q, max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False):
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        books = []
        for q in qrys:
            top = self.searcher.search(q, max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found).add_snippets(self.get_snippets(found, q)))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False):
        books = []

        # content only query : themes x content
        q = BooleanQuery()

        tokens = self.get_tokens(searched)
        q.add(BooleanClause(self.make_term_query(tokens, field='themes', fuzzy=fuzzy), BooleanClause.Occur.MUST))
        q.add(BooleanClause(self.make_term_query(tokens, field='content', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        # joined query themes/content x author/title/epochs/genres/kinds
        # q = BooleanQuery()
        # in_meta = BooleanQuery()
        # in_content = BooleanQuery()

        # for fld in ['themes', 'content']:
        #     in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        # in_meta.add(BooleanClause(self.make_term_query(
        #     self.get_tokens(searched, field='author'), field='author', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        # for fld in ['title', 'epochs', 'genres', 'kinds']:
        #     in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        # q.add(BooleanClause(in_meta, BooleanClause.Occur.MUST))
        # in_content_join = self.content_query(in_content)
        # q.add(BooleanClause(in_content_join, BooleanClause.Occur.MUST))
        # #        import pdb; pdb.set_trace()
        # collector = BlockJoinCollector(Sort.RELEVANCE, 100, True, True)

        # self.searcher.search(q, collector)

        # top_groups = collector.getTopGroups(in_content_join, Sort.RELEVANCE, 0, max_results, 0, True)
        # if top_groups:
        #     for grp in top_groups.groups:
        #         for part in grp.scoreDocs:
        #             books.append(SearchResult(self.searcher, part, score=grp.maxScore))
        return books

    def multisearch(self, query, max_results=50):
        """
        Search strategy:
        - (phrase) OR -> content
                      -> title
                      -> author
        - (keywords)  -> author
                      -> motyw
                      -> tags
                      -> content
        """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        return None

    def book_search(self, query, filter=None, max_results=50, collector=None):
        tops = self.searcher.search(query, filter, max_results)
        #tops = self.searcher.search(p_content, max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            b = models.Book.objects.get(id=doc.get("book_id"))
            bks.append(b)
            print("%s (%d) -> %f" % (b, b.id, found.score))
        return bks

    def get_snippets(self, scoreDoc, query, field='content'):
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate content: Snippets.get() takes a single (position, length) pair
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
        print('snips: %s' % snip)

        return [snip]

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a Lucene TermEnum to a JArray of Terms, suitable for
        adding to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print("%s (%d) -> %f" % (tag, tag.id, found.score))

        return tags

    def create_prefix_phrase(self, toks, field):
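        """Builds a MultiPhraseQuery from the tokens, expanding the last token
        into every indexed term that starts with it (prefix matching); this is
        what the hint_* autocomplete helpers below rely on.
        """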
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = MultiSearch.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    def hint_tags(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        book_cat = TermsFilter()
        book_cat.addTerm(Term("tag_category", "book"))

        no_book_cat = BooleanFilter()
        no_book_cat.add(FilterClause(book_cat, BooleanClause.Occur.MUST_NOT))

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')
        only_books = TermsFilter()
        only_books.addTerm(Term("is_book", "true"))

        return self.book_search(q, only_books, max_results=max_results)

    def filter_by_tags(self, tags):
        q = BooleanQuery()

        for tag in tags:
            toks = self.get_tokens(tag.name, field='tags')
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term("tags", tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def filtered_categories(self, tags):
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()
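

# A minimal usage sketch (assumed, not part of the original module):
#
#     search = MultiSearch()
#     books, total = search.simple_search(u"Mickiewicz")
#     hints = search.hint_tags(u"rom")
#     results = SearchResult.aggregate(search.search_perfect_book(u"Pan Tadeusz"))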