# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize JVM
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        self.addAnalyzer("is_book", keyword)

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)

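# A sketch of what WLAnalyzer's per-field routing means in practice
# (hypothetical inputs; exact tokens depend on the underlying analyzers):
#   'tags'  -> SimpleAnalyzer:  u'Pan Tadeusz' -> [u'pan', u'tadeusz']
#   'url'   -> KeywordAnalyzer: the whole value kept as a single token
#   default -> PolishAnalyzer:  tokenized and stemmed for Polish text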

class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    Stores plain-text snippets for books in per-book files and retrieves
    them by (position, length) pairs.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        self.file.close()

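# Snippets usage sketch (hypothetical book id; assumes SEARCH_INDEX exists):
#
#   snippets = Snippets(123).open('w')
#   try:
#       pos = snippets.add(u"Litwo! Ojczyzno moja!")  # -> (offset, byte length)
#   finally:
#       snippets.close()
#
#   # Later, the stored (offset, length) pair retrieves the same text:
#   snippets = Snippets(123).open()
#   try:
#       print(snippets.get(pos))
#   finally:
#       snippets.close()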

class Index(IndexStore):
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def index_tags(self):
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['author']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
        fields = {}
        book_info = dcparser.parse(book.xml_file)

        print("extract metadata for book %s id=%d, thread %d" % (book.slug, book.id, current_thread().ident))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
        return fields

    def get_master(self, root):
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def add_gaps(self, fields, fieldname):
        # Interleave a NOT_ANALYZED separator field between every two values,
        # apparently so phrase queries cannot match across adjacent values.
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

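    # add_gaps sketch (hypothetical fields): given two 'tags' values, a gap
    # field is interleaved and the trailing gap is dropped:
    #   add_gaps([f_epika, f_liryka], 'tags') -> [f_epika, gap, f_liryka]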
    def index_content(self, book, book_fields=[]):
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            # depth-first walk yielding (node, None) on entering a node
            # and (None, node) on leaving it
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("/$", "", text, flags=re.M)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span', 1)))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            # lxml may hand us either unicode or str; normalize to utf-8 str
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue

                content = u' '.join([t for t in header.itertext()])
                content = fix_format(content)

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)

                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: u'(none)' if x is None else unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None
    pool = None
    pool_jobs = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread())
            ReusableIndex.pool_jobs = []
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    def index_book(self, *args, **kw):
        job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
        ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            print("waiting for indexing to finish")
            for job in ReusableIndex.pool_jobs:
                job.get()
                sys.stdout.write('.')
                sys.stdout.flush()
            print("done.")
            ReusableIndex.pool.close()

            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass

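# ReusableIndex usage sketch (assuming `books` is an iterable of Book objects):
#
#   idx = ReusableIndex()
#   idx.open()
#   for book in books:
#       idx.index_book(book)        # queued on the thread pool
#   ReusableIndex.close_reusable()  # or simply rely on the atexit hook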

class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=[]):
        """
        Recursively modify the query, so that contained Term and Phrase
        queries which match the provided fields are wrapped in a
        BlockJoinQuery and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

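    # wrapjoins sketch: with fields=['content'], a query like
    # (author:mickiewicz AND content:litwo) keeps the author clause as-is,
    # while the content clause is wrapped in a BlockJoinQuery and scored
    # against child (fragment) documents.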
    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

# TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
# OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
# CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);

# while (tokenStream.incrementToken()) {
#     int startOffset = offsetAttribute.startOffset();
#     int endOffset = offsetAttribute.endOffset();
#     String term = charTermAttribute.toString();
# }


class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
        self.snippets = []

        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self.hits = []

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = int(header_span) if header_span is not None else 1

        fragment = stored.get("fragment_anchor")

        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets})

        self.hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.hits += other.hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        frags = filter(lambda r: r[1] is not None, self.hits)
        sect = filter(lambda r: r[1] is None, self.hits)
        # keep only section hits that are not covered by any fragment hit
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
            frags)), sect)

        hits = []

        for s in sect:
            m = {'score': s[2],
                 'header_index': s[0][1]
                 }
            m.update(s[3])
            hits.append(m)

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[1])
            m = {'score': f[2],
                 'fragment': frag,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[3])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        print("--- %s" % hits)

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)

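# Aggregation sketch (hypothetical result lists): merge hits per book and
# order by score, which __cmp__ makes possible with plain sorted():
#
#   by_book = SearchResult.aggregate(perfect_hits, everywhere_hits)
#   for result in sorted(by_book, reverse=True):
#       print(result.book, result.process_hits())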

class Hint(object):
    def __init__(self, search):
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._book = None

    def book(self, book):
        self._book = book

    def tags(self, tags):
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))
        if self._book is not None:
            fs.append(NumericRangeFilter.newIntRange('book_id', self._book.id, self._book.id, True, True))
        return MultiSearch.chain_filters(fs)

    def should_search_for_book(self):
        return self._book is None

    def just_search_in(self, all):
        """Decide which fields should be searched, given the hints we already have."""
        some = []
        for field in all:
            if field == 'author' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._book is not None:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some

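# Hint usage sketch (hypothetical tag queryset):
#
#   hint = search.hint()
#   hint.tags(catalogue.models.Tag.objects.filter(category='author'))
#   hint.book_filter()                        # -> Filter for the hinted author
#   hint.just_search_in(['author', 'title'])  # -> ['title']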

class MultiSearch(Search):
    """Class capable of IMDb-like searching."""
    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: a StringReader, a string/unicode, or a token list.
        In the last case the tokens are just returned (so tokens can be
        reused, as long as the analyzer does not change).
        """
        if isinstance(searched, basestring):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        # normalize the fuzzy argument to a minimum-similarity value
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

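    # Query-building sketch (assumes an open MultiSearch instance `s`):
    #
    #   toks = s.get_tokens(u'pan tadeusz', field='title')
    #   s.make_phrase(toks, field='title')        # phrase query with slop=2
    #   s.make_term_query(toks, field='content')  # content:pan OR content:tadeusz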
    def content_query(self, query):
        return BlockJoinQuery(query, self.parent_filter,
                              BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['author', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens = self.get_tokens(searched)
        if hint is None or hint.just_search_in(['themes_pl']) != []:
            q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        # query themes/content x author/title/tags
        in_content = BooleanQuery()

        for fld in ['themes', 'content', 'tags', 'author', 'title']:
            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(in_content, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        return books

    def multisearch(self, query, max_results=50):
        """
        Search strategy:
        - (phrase) OR -> content
                      -> title
                      -> author
        - (keywords)  -> author
                      -> motyw
                      -> tags
                      -> content
        """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        return None

    def book_search(self, query, filter=None, max_results=50, collector=None):
        tops = self.searcher.search(query, filter, max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
            bks.append(b)
            print("%s (%d) -> %f" % (b, b.id, found.score))
        return bks

    def get_snippets(self, scoreDoc, query, field='content'):
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate the content in the snippet store
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return [snip]

    @staticmethod
    def enum_to_array(enum):
        """
        Convert a Lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print("%s (%d) -> %f" % (tag, tag.id, found.score))

        return tags

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                # expand the last token to all indexed terms with this prefix
                pterms = MultiSearch.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.book_search(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

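    # chain_filters drops None entries, so optional filters can be passed
    # unconditionally, as in search_perfect_book:
    #   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))])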
    def filtered_categories(self, tags):
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)
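

# End-to-end usage sketch (hypothetical query strings):
#
#   search = MultiSearch()
#   results = search.search_perfect_book(u'pan tadeusz')
#   results += search.search_everywhere(u'zemsta')
#   for r in SearchResult.aggregate(results):
#       print(r.get_book())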