# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM before any Lucene classes are used.
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
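
# Illustrative sketch (not part of the original module): shows which analyzer
# the wrapper picks for a given field.  Fields registered above get their own
# analyzer; anything else falls back to the default PolishAnalyzer.  The sample
# text and field name are arbitrary assumptions.
def _example_wlanalyzer_tokens(text=u"Zielone tagi"):
    analyzer = WLAnalyzer()
    # "tags" is handled by SimpleAnalyzer, so the text is lower-cased and split
    # on non-letter characters.
    stream = analyzer.reusableTokenStream("tags", StringReader(text))
    tokens = []
    while stream.incrementToken():
        tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
    return tokens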


class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        self.file.close()
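
# Usage sketch (not from the original file): snippets are stored as raw UTF-8
# bytes in one file per book, and add()/get() work on (offset, length) pairs.
# The book id 1 below is an arbitrary assumption.
def _example_snippet_roundtrip():
    store = Snippets(1).open('w')
    try:
        pos = store.add(u"Litwo! Ojczyzno moja!")   # returns (byte offset, byte length)
    finally:
        store.close()
    store = Snippets(1).open('r')
    try:
        return store.get(pos)                       # the same unicode text back
    finally:
        store.close()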


class Index(IndexStore):
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def index_tags(self):
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def create_book_doc(self, book):
        """
        Create a Lucene document connected to the book.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book, book_info=None):
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        return fields

    def get_master(self, root):
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def add_gaps(self, fields, fieldname):
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
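
    # Added note (sketch, not from the original file): add_gaps interleaves a
    # single-space, NOT_ANALYZED separator field between multi-valued entries,
    # e.g. roughly
    #     [Field("tags", u"Adam Mickiewicz"), <gap>, Field("tags", u"Epos")]
    # which appears intended to keep the values apart in the token stream; the
    # trailing separator is dropped by the [0:-1] slice above.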

    def index_content(self, book, book_fields=[]):
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("/$", "", text, flags=re.M)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue

                content = u' '.join([t for t in header.itertext()])
                content = fix_format(content)

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)

                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: x is None and u'(none)' or unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
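
# Usage sketch (assumption, not from the original file): Index is a context
# manager, so a one-off (re)indexing run could look like this.  `book` stands
# for a catalogue.models.Book instance.
def _example_index_one_book(book):
    with Index() as index:
        index.index_book(book)   # removes any previous documents for the book first
        index.index_tags()       # refresh the tag documents as well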


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None
    pool = None
    pool_jobs = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread())
            ReusableIndex.pool_jobs = []
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    def index_book(self, *args, **kw):
        job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
        ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            print("wait for indexing to finish")
            for job in ReusableIndex.pool_jobs:
                job.get()
                sys.stdout.write('.')
                sys.stdout.flush()
            print("done.")
            ReusableIndex.pool.close()

            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass
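
# Usage sketch (illustrative, not from the original file): ReusableIndex keeps
# one shared writer plus a thread pool, so a bulk import can queue many books
# and flush once at the end (or simply rely on the atexit hook).
def _example_bulk_import(books):
    index = ReusableIndex()
    index.open()
    for book in books:
        index.index_book(book)         # queued on the shared thread pool
    ReusableIndex.close_reusable()     # wait, optimize and close explicitly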


class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=[]):
        """
        Modifies the query recursively: contained Term and Phrase queries
        that match only the provided fields are wrapped in a BlockJoinQuery
        and are thus delegated to the child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)."""

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
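
# Usage sketch (illustrative, assumes an index has already been built): a plain
# full-text query against the default 'content' field; the query string is an
# arbitrary assumption.
def _example_simple_search():
    search = Search()
    books, total_hits = search.simple_search(u"Ojczyzno moja", max_results=10)
    return books, total_hits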

# TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
# OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
# CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);

# while (tokenStream.incrementToken()) {
#     int startOffset = offsetAttribute.startOffset();
#     int endOffset = offsetAttribute.endOffset();
#     String term = charTermAttribute.toString();
# }


class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
        self.snippets = []

        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self.hits = []

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = header_span is not None and int(header_span) or 1

        fragment = stored.get("fragment_anchor")

        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets})

        self.hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.hits += other.hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        frags = filter(lambda r: r[1] is not None, self.hits)
        sect = filter(lambda r: r[1] is None, self.hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
            frags)), sect)

        hits = []

        for s in sect:
            m = {'score': s[2],
                 'header_index': s[0][1]
                 }
            m.update(s[3])
            hits.append(m)

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[1])
            m = {'score': f[2],
                 'fragment': frag,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[3])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        print("--- %s" % hits)

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    # print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)
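
# Sketch (illustrative, not from the original file): results coming from
# several search strategies are merged per book and sorted by score.
# `multisearch` is assumed to be a MultiSearch instance and the query string
# is arbitrary.
def _example_aggregate(multisearch, query=u"Pan Tadeusz"):
    perfect = multisearch.search_perfect_book(query)
    everywhere = multisearch.search_everywhere(query)
    results = SearchResult.aggregate(perfect, everywhere)
    return sorted(results, reverse=True)   # sorted() uses __cmp__ defined above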


class Hint(object):
    def __init__(self, search):
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        self._books = books

    def tags(self, tags):
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return MultiSearch.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Figures out which indexes should still be searched when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
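
# Usage sketch (illustrative, not from the original file): a Hint narrows the
# later queries.  Pre-selecting an author tag makes search_perfect_book skip
# the 'authors' field and apply a book filter instead; `author_tag` is assumed
# to be a catalogue.models.Tag with category 'author'.
def _example_hinted_search(multisearch, author_tag):
    hint = multisearch.hint()
    hint.tags([author_tag])
    return multisearch.search_perfect_book(u"Pan Tadeusz", hint=hint)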


class MultiSearch(Search):
    """Class capable of IMDb-like searching."""
    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode object, or a list of
        tokens; in the last case the tokens are returned unchanged (so they can be
        reused as long as the analyzer does not change).
        """
        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5
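
    # Added note (sketch): fuzziness(True) -> 0.5, fuzziness(0.8) -> 0.8 and
    # fuzziness(None) -> None; the returned value is used below as the minimum
    # similarity for FuzzyQuery / FuzzyTermEnum.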

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    # def content_query(self, query):
    #     return BlockJoinQuery(query, self.parent_filter,
    #                           BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt
                                                           ]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens = self.get_tokens(searched)
        if hint is None or hint.just_search_in(['themes_pl']) != []:
            q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        # query themes/content x author/title/tags
        #        in_meta = BooleanQuery()
        in_content = BooleanQuery()

        for fld in ['themes', 'content', 'tags', 'authors', 'title']:
            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(in_content, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        return books

    def multisearch(self, query, max_results=50):
        """
        Search strategy:
        - (phrase) OR -> content
                      -> title
                      -> authors
        - (keywords)  -> authors
                      -> motyw
                      -> tags
                      -> content
        """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        return None

    def book_search(self, query, filter=None, max_results=50, collector=None):
        tops = self.searcher.search(query, filter, max_results)
        # tops = self.searcher.search(p_content, max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
            bks.append(b)
            print("%s (%d) -> %f" % (b, b.id, found.score))
        return bks

    def get_snippets(self, scoreDoc, query, field='content'):
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate the content in the snippet store
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return [snip]

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a Lucene TermEnum to an array of Terms, suitable for
        adding to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print("%s (%d) -> %f" % (tag, tag.id, found.score))

        return tags

    def search_books(self, query, filter=None, max_results=10):
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = MultiSearch.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.book_search(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
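
    # Added note (sketch): chain_filters drops the None entries first, so e.g.
    # chain_filters([None, only_books]) simply wraps only_books, while
    # chain_filters([]) returns None (meaning "no filter at all").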

    def filtered_categories(self, tags):
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)
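
# End-to-end sketch (assumption: a built index already exists on disk): prefix
# hints for an autocomplete box, combining tag and title suggestions.  The
# prefix value is arbitrary.
def _example_autocomplete(prefix=u"mick"):
    search = MultiSearch()
    return search.hint_tags(prefix), search.hint_books(prefix)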