# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray
    # KeywordAnalyzer

# Initialize the JVM
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """Stores plain-text snippets of a single book in one file and hands out
    (position, length) pairs that are used later to read them back."""
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        self.file.close()

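
# A minimal usage sketch (not called anywhere): add() returns the
# (position, length) pair that gets stored in the Lucene document, and get()
# reads the text back for highlighting.  Assumes Django settings with a
# writable SEARCH_INDEX; the book id and text are only examples.
def _example_snippets_roundtrip():
    snips = Snippets(1).open('w')
    try:
        pos = snips.add(u"Litwo! Ojczyzno moja!")  # -> (offset, length in bytes)
    finally:
        snips.close()
    snips = Snippets(1).open('r')
    try:
        return snips.get(pos)  # reads the snippet text back
    finally:
        snips.close()
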

class Index(IndexStore):
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def index_tags(self):
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['author']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def create_book_doc(self, book):
        """
        Create a Lucene document connected to the book.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
        fields = {}
        book_info = dcparser.parse(open(book.xml_file.path))

        print("extract metadata for book %s id=%d, thread %d" % (book.slug, book.id, current_thread().ident))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
        return fields

    def get_master(self, root):
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def add_gaps(self, fields, fieldname):
        """Interleave the given fields with single-space gap fields (a stand-in
        for a position increment gap), meant to keep phrase queries from
        matching across separate values of a multi-valued field."""
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def index_content(self, book, book_fields=[]):
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("/$", "", text, flags=re.M)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue

                content = u' '.join([t for t in header.itertext()])
                content = fix_format(content)

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)

                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: x is None and u'(none)' or unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()

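
# A minimal sketch of how Index is meant to be used (__enter__/__exit__ above
# open and close the IndexWriter).  The slug is only an example; any existing
# Book instance works the same way.
def _example_index_book():
    book = catalogue.models.Book.objects.get(slug='pan-tadeusz')
    with Index() as index:
        index.index_book(book)  # writes the metadata and content documents
        index.index_tags()      # refreshes tag documents as well
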

def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None
    pool = None
    pool_jobs = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread())
            ReusableIndex.pool_jobs = []
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    def index_book(self, *args, **kw):
        job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
        ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            print("wait for indexing to finish")
            for job in ReusableIndex.pool_jobs:
                job.get()
                sys.stdout.write('.')
                sys.stdout.flush()
            print("done.")
            ReusableIndex.pool.close()

            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass

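
# A minimal sketch of the ReusableIndex workflow described in its docstring:
# books are queued onto the thread pool, and the shared Lucene index is only
# optimized and closed at interpreter exit (or when close_reusable() is
# called explicitly).  `books` stands for any iterable of catalogue Books.
def _example_reusable_index(books):
    index = ReusableIndex()
    index.open(threads=4)
    for book in books:
        index.index_book(book)      # runs asynchronously in the pool
    ReusableIndex.close_reusable()  # optional; atexit would do this anyway
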


class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=[]):
        """
        Recursively rewrites the query so that contained Term and Phrase
        queries matching the provided fields are wrapped in a BlockJoinQuery
        and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

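
# A minimal sketch of the three entry points above: simple_search() parses the
# query against the default field, search() additionally wraps content and
# theme terms in block joins, and bsearch() joins child hits up to their
# parent book documents.  The query strings are only examples.
def _example_search():
    search = Search()
    books, total = search.simple_search(u"pan tadeusz")
    joined_books, _ = search.search(u"szlachta")
    block_joined, _ = search.bsearch(u"zajazd")
    return books, joined_books, block_joined
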
# A reference note (in Java) on reading a TokenStream with offset and term
# attributes — roughly what MultiSearch.get_tokens() below does from Python:
#
# TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
# OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
# CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
#
# while (tokenStream.incrementToken()) {
#     int startOffset = offsetAttribute.startOffset();
#     int endOffset = offsetAttribute.endOffset();
#     String term = charTermAttribute.toString();
# }


class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
        self.snippets = []

        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self.hits = []

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = header_span is not None and int(header_span) or 1

        fragment = stored.get("fragment_anchor")

        # hit: ((header_type, header_index, header_span), fragment_anchor, score, extra)
        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets})

        self.hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.hits += other.hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        frags = filter(lambda r: r[1] is not None, self.hits)
        sect = filter(lambda r: r[1] is None, self.hits)
        # drop section hits that fall inside one of the fragment hits
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
            frags)), sect)

        hits = []

        for s in sect:
            m = {'score': s[2],
                 'header_index': s[0][1]
                 }
            m.update(s[3])
            hits.append(m)

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[1])
            m = {'score': f[2],
                 'fragment': frag,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[3])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        print("--- %s" % hits)

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)


class Hint(object):
    def __init__(self, search):
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        self._books = books

    def tags(self, tags):
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return MultiSearch.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Decides which of the given fields should still be searched,
        given the hints we already have."""
        some = []
        for field in all:
            if field == 'author' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some

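
# A minimal sketch of how a Hint narrows MultiSearch queries below: tags of
# the 'author'/'title'/'epoch'/... categories feed the book filter, 'theme'
# tags and explicitly given books feed the part filter.  The tag and book
# arguments stand for existing catalogue objects; the query string is only
# an example.
def _example_hint(search, author_tag, theme_tag, book):
    hint = search.hint()                # same as Hint(search)
    hint.tags([author_tag, theme_tag])  # author -> book filter, theme -> part filter
    hint.books(book)                    # restrict the part search to this book
    return search.search_perfect_parts(u"zemsta", hint=hint)
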

class MultiSearch(Search):
    """Class capable of IMDb-like searching"""
    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode, or a list of tokens.
        In the last case the tokens are returned unchanged (so they can be reused
        as long as the analyzer does not change).
        """
        if isinstance(searched, (str, unicode)):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

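
    # get_tokens() drives everything below: hint_tags()/hint_books() tokenize
    # with the 'SIMPLE' analyzer, while phrase and term queries use the
    # field's own analyzer.  A small sketch (the input string is only an
    # example):
    def _example_get_tokens(self):
        simple_toks = self.get_tokens(u"Pan Tadeusz", field='SIMPLE')
        polish_toks = self.get_tokens(u"Pan Tadeusz", field='POLISH')
        return simple_toks, polish_toks
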
    def fuzziness(self, fuzzy):
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

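
    # make_phrase() and make_term_query() are the two building blocks used by
    # the search_* methods below: the former matches the tokens as a (possibly
    # fuzzy) phrase, the latter as independent (possibly fuzzy) terms.  A
    # sketch of how they combine (the searched string is only an example):
    def _example_queries(self):
        tokens = self.get_tokens(u"ostatni zajazd na Litwie", field='title')
        phrase_q = self.make_phrase(tokens, field='title', slop=2)
        terms_q = self.make_term_query(tokens, field='content', fuzzy=0.7)
        return phrase_q, terms_q
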
    # def content_query(self, query):
    #     return BlockJoinQuery(query, self.parent_filter,
    #                           BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['author', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt
                                                          ]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens = self.get_tokens(searched)
        if hint is None or hint.just_search_in(['themes_pl']) != []:
            q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        # query themes/content x author/title/tags
        # NOTE: in_meta and in_content are built here but never added to q.
        q = BooleanQuery()
        in_meta = BooleanQuery()
        in_content = BooleanQuery()

        for fld in ['themes', 'content', 'tags', 'author', 'title']:
            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        return books

    def multisearch(self, query, max_results=50):
        """
        Search strategy:
        - (phrase) OR -> content
                      -> title
                      -> author
        - (keywords)  -> author
                      -> motyw
                      -> tags
                      -> content
        """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        return None

    def book_search(self, query, filter=None, max_results=50, collector=None):
        tops = self.searcher.search(query, filter, max_results)
        #tops = self.searcher.search(p_content, max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
            bks.append(b)
            print("%s (%d) -> %f" % (b, b.id, found.score))
        return bks

    def get_snippets(self, scoreDoc, query, field='content'):
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate the content.
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        #        import pdb; pdb.set_trace()
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return [snip]

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a Lucene TermEnum to an array of Terms, suitable for
        adding to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print("%s (%d) -> %f" % (tag, tag.id, found.score))

        return tags

    def search_books(self, query, filter=None, max_results=10):
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = MultiSearch.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.book_search(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)