Some refactoring & documentation changes in search.
[wolnelektury.git] / apps / search / index.py
# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize jvm
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

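# A usage sketch for Snippets (not executed here; the book id 1 is
# illustrative). Snippets are appended once during indexing and read
# back by (position, length) when highlighting results:
#
#     snippets = Snippets(1).open('w')
#     try:
#         pos = snippets.add(u"Pierwszy fragment tekstu.")
#     finally:
#         snippets.close()
#
#     snippets = Snippets(1).open()
#     try:
#         assert snippets.get(pos) == u"Pierwszy fragment tekstu."
#     finally:
#         snippets.close()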

class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on the index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

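# BaseIndex is a context manager, so any subclass can be used in a
# `with` block; a sketch (Index is defined below):
#
#     with Index() as index:
#         index.index_tags()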

class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny',
        'liryka_l',
        'liryka_lp',
        'wywiad',
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day),
                                               Field.Store.NO, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields (indexed spaces) and returns the result.
        This allows phrase queries which do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

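    # For example (a sketch): add_gaps([f1, f2, f3], 'tags') returns
    # [f1, gap, f2, gap, f3], where each gap is an unanalyzed ' ' field
    # named 'tags'. With slop 0, a phrase query can then never match
    # across the boundary between two adjacent tag values.
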
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            # Depth-first walk yielding (node, None) on entering a node
            # and (None, node) on leaving it.
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span', 1)))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue

                content = u' '.join(header.itertext())
                content = fix_format(content)

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)

                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # an empty themes list
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: u'(none)' if x is None else unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()

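# A sketch of indexing a single book (the slug is illustrative; assumes
# an open index, e.g. via the context manager):
#
#     book = catalogue.models.Book.objects.get(slug='pan-tadeusz')
#     with Index() as index:
#         index.index_book(book)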

def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass

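# A usage sketch for ReusableIndex: several books can be indexed in one
# process without reopening the writer each time; the shared index is
# closed by the atexit hook (the book objects are illustrative):
#
#     for book in catalogue.models.Book.objects.all():
#         with ReusableIndex() as index:
#             index.index_book(book)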

class JoinSearch(object):
    """
    This mixin can be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query recursively, so that contained
        Term and Phrase queries which match the provided fields are
        wrapped in a BlockJoinQuery and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, searcher, scoreDoc, score=None, how_found=None, snippets=None):
        self.snippets = []

        if score:
            self.score = score
        else:
            self.score = scoreDoc.score

        self.hits = []

        stored = searcher.doc(scoreDoc.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = int(header_span) if header_span is not None else 1

        fragment = stored.get("fragment_anchor")

        hit = (sec + (header_span,), fragment, scoreDoc.score, {'how_found': how_found, 'snippets': [snippets]})

        self.hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.hits += other.hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        # Hits with a fragment anchor are fragments; the rest are sections.
        frags = filter(lambda r: r[1] is not None, self.hits)
        sect = filter(lambda r: r[1] is None, self.hits)
        # Drop sections already covered by some fragment's header span.
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
            frags)), sect)

        hits = []

        for s in sect:
            m = {'score': s[2],
                 'header_index': s[0][1]
                 }
            m.update(s[3])
            hits.append(m)

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[1])
            m = {'score': f[2],
                 'fragment': frag,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[3])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)

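# A sketch of combining results from several strategies (the query
# string is illustrative): aggregate() merges per-book hits, and
# sorted() uses SearchResult.__cmp__ to order books by score:
#
#     search = Search()
#     parts = search.search_perfect_parts(u'lato')
#     books = search.search_perfect_book(u'lato')
#     results = sorted(SearchResult.aggregate(books, parts), reverse=True)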

class Hint(object):
    """
    Given some hint information (things we already know about the search
    target, like author, title (a specific book), epoch, genre or kind),
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of them)
        are necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (normally they are in the
        'tags' field), returns a filter accepting only books with those tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return not self._books

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched
        when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some

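# A usage sketch for Hint (the tag lookup is illustrative): collect what
# we already know, then pass the hint to the search methods below:
#
#     search = Search()
#     hint = search.hint()
#     hint.tags(catalogue.models.Tag.objects.filter(category='author',
#                                                   slug='adam-mickiewicz'))
#     books = search.search_perfect_book(u'pan tadeusz', hint=hint)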

class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits).
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

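    # A sketch of a human-typed query (the query string is illustrative);
    # simple_search parses plain Lucene syntax against the default field:
    #
    #     search = Search()
    #     books, total = search.simple_search(u'title:lato OR authors:mickiewicz')
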
    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: a StringReader, a string/unicode, or tokens. In the
        last case they are just returned (so tokens can be reused, as long as
        the analyzer does not change).
        """
        if isinstance(searched, basestring):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness"""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

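    # For example (a sketch): make_phrase([u'pan', u'tadeusz'], field='title')
    # builds a PhraseQuery equivalent to  title:"pan tadeusz"~2 , while
    # fuzzy=0.7 builds a MultiPhraseQuery in which each position accepts
    # every indexed term within that similarity of the given token.
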
    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    # def content_query(self, query):
    #     return BlockJoinQuery(query, self.parent_filter,
    #                           BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some
        author or title, taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with
        a slop of 2, the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Tries to use the search terms to match different fields of the book
        (or its parts). E.g. one word can be an author's surname, another a
        part of the title, and the rest words from the third chapter.
        """
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens = self.get_tokens(searched)
        if hint is None or hint.just_search_in(['themes_pl']) != []:
            q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        # query themes/content x author/title/tags
        in_content = BooleanQuery()

        for fld in ['themes', 'content', 'tags', 'authors', 'title']:
            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(in_content, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)

    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD

    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)

    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')

    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))

    #     kw_level = BooleanQuery()

    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)

    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))

    #     return None

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate content
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Search for Tag objects using query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i, tok in enumerate(toks):
            t = Term(field, tok)
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        """
        Return auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Returns auto-complete hints for book titles, since we do not index
        'pseudo' title-tags.
        Prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

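    # For example (a sketch): combine a hint filter with the is_book
    # filter; None entries are dropped, and an all-None list yields None
    # (i.e. no filtering):
    #
    #     flt = Search.chain_filters([hint.book_filter(),
    #                                 Search.term_filter(Term('is_book', 'true'))])
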
    def filtered_categories(self, tags):
        """
        Returns a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)