# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize jvm
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        self.addAnalyzer("is_book", keyword)

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    Keeps the raw text of indexed fragments in a per-book file, so that
    search results can show highlighted snippets without storing the whole
    content in Lucene. add() returns a (position, length) pair which get()
    later uses to read the text back.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        print "Snip<%s>%s</s>" % (pos, txt)
        return pos

    def get(self, pos):
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        print "got from snippets %d bytes from %s:" % (len(txt), pos)
        return txt

    def close(self):
        self.file.close()
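
# A minimal usage sketch for Snippets (illustrative only; `book` stands for an
# assumed catalogue.models.Book instance). add() returns a (position, length)
# pair describing where the raw text landed in the per-book snippet file, and
# get() reads it back for highlighting:
#
#     snippets = Snippets(book.id).open('w')
#     try:
#         pos = snippets.add(u"Litwo! Ojczyzno moja!")   # -> (offset, byte length)
#     finally:
#         snippets.close()
#
#     reader = Snippets(book.id).open()
#     try:
#         text = reader.get(pos)
#     finally:
#         reader.close()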


class Index(IndexStore):
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer, \
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

        self.index.close()
        self.index = None

    def index_tags(self):
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['author']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def create_book_doc(self, book):
        """
        Create a lucene document connected to the book
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
        fields = {}
        book_info = dcparser.parse(book.xml_file)

        print("extract metadata for book %s id=%d, thread %d" % (book.slug, book.id, current_thread().ident))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" % \
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
        return fields

    def get_master(self, root):
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def add_gaps(self, fields, fieldname):
        """
        Interleave gap fields between the given fields, to keep phrase
        queries from matching across neighbouring values (a poor man's
        position increment gap).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        # zip() pairs every field with a gap; reduce() concatenates the pairs
        # and the slice drops the trailing gap.
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def index_content(self, book, book_fields=[]):
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("/$", "", text, flags=re.M)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)\
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue

                content = u' '.join([t for t in header.itertext()])
                content = fix_format(content)

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)

                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        fragments[fid]['themes'].append(start.text)
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: x is None and u'(none)' or unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()


    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
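
# A minimal sketch of how this class is meant to be used (illustrative only;
# `book` stands for an assumed catalogue.models.Book instance):
#
#     with Index() as index:        # opens an IndexWriter, closes/optimizes on exit
#         index.index_tags()
#         index.index_book(book)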


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None
    pool = None
    pool_jobs = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread())
            ReusableIndex.pool_jobs = []
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    def index_book(self, *args, **kw):
        job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
        ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            print("wait for indexing to finish")
            for job in ReusableIndex.pool_jobs:
                job.get()
                sys.stdout.write('.')
                sys.stdout.flush()
            print("done.")
            ReusableIndex.pool.close()

            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass
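
# Hedged usage sketch for ReusableIndex (illustrative; `books` stands for an
# assumed iterable of catalogue.models.Book objects). Indexing jobs run on the
# thread pool and the Lucene writer is only closed in close_reusable(), either
# via the atexit hook or explicitly:
#
#     index = ReusableIndex()
#     index.open()
#     for book in books:
#         index.index_book(book)        # queued on the thread pool
#     ReusableIndex.close_reusable()    # waits for jobs, optimizes and closes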


class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=[]):
        """
        Recursively modifies the query so that Term and Phrase queries
        which match only the provided fields are wrapped in a BlockJoinQuery
        and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)."""
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

# Reference: walking a Lucene TokenStream (Java):
#
# TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
# OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
# CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
#
# while (tokenStream.incrementToken()) {
#     int startOffset = offsetAttribute.startOffset();
#     int endOffset = offsetAttribute.endOffset();
#     String term = charTermAttribute.toString();
# }


class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
        self.snippets = []

        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self.hits = []

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = header_span is not None and int(header_span) or 1

        fragment = stored.get("fragment_anchor")

        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets})

        self.hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.hits += other.hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def get_parts(self):
        book = self.book

        def sections_covered(results):
            frags = filter(lambda r: r[1] is not None, results)
            sect = filter(lambda r: r[1] is None, results)
            sect = filter(lambda s: 0 == len(filter(
                lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
                frags)), sect)
            print "filtered, non-overlapping sections: %s" % sect
            return frags + sect

        parts = [{"header": s[0], "position": s[1], '_score_key': s} for s in self.sections] \
            + [{"fragment": book.fragments.get(anchor=f), '_score_key': f} for f in self.fragments]

        parts.sort(lambda a, b: cmp(self.scores[a['_score_key']], self.scores[b['_score_key']]))
        print("bookid: %d parts: %s" % (self.book_id, parts))
        return parts

    parts = property(get_parts)

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)
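
# Illustrative sketch of how results from several query passes are usually
# combined (assumes `search` is a MultiSearch instance and `q` a query string):
#
#     perfect = search.search_perfect_book(q)
#     parts = search.search_perfect_parts(q)
#     everywhere = search.search_everywhere(q)
#     results = SearchResult.aggregate(perfect, parts, everywhere)
#     results.sort(reverse=True)        # __cmp__ orders by score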


class Hint(object):
    def __init__(self, search):
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self.book = None

    def set_book(self, book):
        self.book = book

    def tags(self, tags):
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        if self.part_tags:
            return self.tag_filter(self.part_tags, field='themes')
        else:
            return None

    def should_search_for_book(self):
        return self.book is None

    def just_search_in(self, all):
        """Decides which fields should still be searched, given the hints we already have."""
        some = []
        for field in all:
            if field == 'author' and 'author' in self.book_tags:
                continue
            if field == 'title' and self.book is not None:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
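
# A short sketch of how a Hint narrows a search (illustrative only; `search`
# is a MultiSearch instance and `tags` an iterable of catalogue.models.Tag):
#
#     hint = search.hint()
#     hint.tags(tags)                   # author/epoch/genre/kind -> book filter,
#                                       # theme -> part filter
#     results = search.search_perfect_book(u"pan tadeusz", hint=hint)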


class MultiSearch(Search):
    """Class capable of IMDb-like searching"""
    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed with the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode, or a list of tokens.
        In the last case the tokens are returned as-is (so they can be reused as
        long as the analyzer does not change).
        """
        if isinstance(searched, (str, unicode)):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def content_query(self, query):
        return BlockJoinQuery(query, self.parent_filter,
                              BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['author', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content only query : themes x content
        q = BooleanQuery()

        tokens = self.get_tokens(searched)
        if hint is None or hint.just_search_in(['themes_pl']) != []:
            q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl', fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens, field='content', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        # joined query themes/content x author/title/epochs/genres/kinds
        # q = BooleanQuery()
        # in_meta = BooleanQuery()
        # in_content = BooleanQuery()

        # for fld in ['themes', 'content']:
        #     in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        # in_meta.add(BooleanClause(self.make_term_query(
        #     self.get_tokens(searched, field='author'), field='author', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        # for fld in ['title', 'epochs', 'genres', 'kinds']:
        #     in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        # q.add(BooleanClause(in_meta, BooleanClause.Occur.MUST))
        # in_content_join = self.content_query(in_content)
        # q.add(BooleanClause(in_content_join, BooleanClause.Occur.MUST))
        # #        import pdb; pdb.set_trace()
        # collector = BlockJoinCollector(Sort.RELEVANCE, 100, True, True)

        # self.searcher.search(q, collector)

        # top_groups = collector.getTopGroups(in_content_join, Sort.RELEVANCE, 0, max_results, 0, True)
        # if top_groups:
        #     for grp in top_groups.groups:
        #         for part in grp.scoreDocs:
        #             books.append(SearchResult(self.searcher, part, score=grp.maxScore))
        return books

    def multisearch(self, query, max_results=50):
        """
        Search strategy:
        - (phrase) OR -> content
                      -> title
                      -> author
        - (keywords)  -> author
                      -> motyw
                      -> tags
                      -> content
        """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        return None

    def book_search(self, query, filter=None, max_results=50, collector=None):
        tops = self.searcher.search(query, filter, max_results)
        #tops = self.searcher.search(p_content, max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
            bks.append(b)
            print "%s (%d) -> %f" % (b, b.id, found.score)
        return bks

    def get_snippets(self, scoreDoc, query, field='content'):
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate content.
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        #        import pdb; pdb.set_trace()
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
        print('snips: %s' % snip)

        return [snip]

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print "%s (%d) -> %f" % (tag, tag.id, found.score)

        return tags

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = MultiSearch.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.book_search(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)
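
# A minimal, illustrative sketch of the typical entry points of MultiSearch
# (field names and methods as defined above; nothing here runs on import):
#
#     search = MultiSearch()
#     tags = search.hint_tags(u"roman")       # prefix search over tag names
#     books = search.hint_books(u"pan tad")   # prefix search over titles
#     hint = search.hint()
#     hint.tags(tags)
#     results = SearchResult.aggregate(
#         search.search_perfect_book(u"pan tadeusz", hint=hint),
#         search.search_everywhere(u"pan tadeusz", hint=hint))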