[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from multiprocessing.pool import ThreadPool
31 from threading import current_thread
32 import atexit
33 import traceback
34
35
36 class WLAnalyzer(PerFieldAnalyzerWrapper):
37     def __init__(self):
38         polish = PolishAnalyzer(Version.LUCENE_34)
39         #        polish_gap.setPositionIncrementGap(999)
40
41         simple = SimpleAnalyzer(Version.LUCENE_34)
42         #        simple_gap.setPositionIncrementGap(999)
43
44         keyword = KeywordAnalyzer(Version.LUCENE_34)
45
46         # not sure if needed: there's NOT_ANALYZED meaning basically the same
47
48         PerFieldAnalyzerWrapper.__init__(self, polish)
49
50         self.addAnalyzer("tags", simple)
51         self.addAnalyzer("technical_editors", simple)
52         self.addAnalyzer("editors", simple)
53         self.addAnalyzer("url", keyword)
54         self.addAnalyzer("source_url", keyword)
55         self.addAnalyzer("source_name", simple)
56         self.addAnalyzer("publisher", simple)
57         self.addAnalyzer("authors", simple)
58         self.addAnalyzer("title", simple)
59
60         self.addAnalyzer("is_book", keyword)
61         # shouldn't the title have two forms? _pl and simple?
62
63         self.addAnalyzer("themes", simple)
64         self.addAnalyzer("themes_pl", polish)
65
66         self.addAnalyzer("tag_name", simple)
67         self.addAnalyzer("tag_name_pl", polish)
68
69         self.addAnalyzer("translators", simple)
70
71         self.addAnalyzer("KEYWORD", keyword)
72         self.addAnalyzer("SIMPLE", simple)
73         self.addAnalyzer("POLISH", polish)
74
75
76 class IndexStore(object):
77     """
78     Provides access to the search index.
79
80     self.store - lucene index directory
81     """
82     def __init__(self):
83         self.make_index_dir()
84         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
85
86     def make_index_dir(self):
87         try:
88             os.makedirs(settings.SEARCH_INDEX)
89         except OSError as exc:
90             if exc.errno == errno.EEXIST:
91                 pass
92             else: raise
93
94
95 class IndexChecker(IndexStore):
96     def __init__(self):
97         IndexStore.__init__(self)
98
99     def check(self):
100         checker = CheckIndex(self.store)
101         status = checker.checkIndex()
102         return status
103
104
105 class Snippets(object):
106     """
107     This class manages the snippet file for an indexed object (book).
108     The snippets are concatenated together, and their positions and
109     lengths are kept in Lucene index fields.
110     """
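    # A minimal usage sketch (illustrative only; the book id 1 is made up):
    #
    #   snips = Snippets(1).open('w')
    #   try:
    #       pos = snips.add(u"Some fragment text")   # -> (position, length)
    #   finally:
    #       snips.close()
    #
    #   snips = Snippets(1).open()                   # reopen for reading
    #   try:
    #       print(snips.get(pos))                    # -> u"Some fragment text"
    #   finally:
    #       snips.close()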
111     SNIPPET_DIR = "snippets"
112
113     def __init__(self, book_id):
114         try:
115             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
116         except OSError as exc:
117             if exc.errno == errno.EEXIST:
118                 pass
119             else: raise
120         self.book_id = book_id
121         self.file = None
122
123     def open(self, mode='r'):
124         """
125         Open the snippet file. Call .close() afterwards.
126         """
127         if 'b' not in mode:
128             mode += 'b'
129         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
130         self.position = 0
131         return self
132
133     def add(self, snippet):
134         """
135         Append a snippet (unicode) to the snippet file.
136         Return a (position, length) tuple
137         """
138         txt = snippet.encode('utf-8')
139         l = len(txt)
140         self.file.write(txt)
141         pos = (self.position, l)
142         self.position += l
143         return pos
144
145     def get(self, pos):
146         """
147         Given a tuple of (position, length), return the unicode
148         snippet stored there.
149         """
150         self.file.seek(pos[0], 0)
151         txt = self.file.read(pos[1]).decode('utf-8')
152         return txt
153
154     def close(self):
155         """Close snippet file"""
156         self.file.close()
157
158
159 class BaseIndex(IndexStore):
160     """
161     Base index class.
162     Provides basic operations on the index: opening, closing, optimizing.
163     """
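    # Usage sketch: BaseIndex subclasses can be used as context managers;
    # the index is opened on entry and optimized/closed on exit, e.g.:
    #
    #   with Index() as index:
    #       index.index_tags()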
164     def __init__(self, analyzer=None):
165         super(BaseIndex, self).__init__()
166         self.index = None
167         if not analyzer:
168             analyzer = WLAnalyzer()
169         self.analyzer = analyzer
170
171     def open(self, analyzer=None):
172         if self.index:
173             raise Exception("Index is already opened")
174         self.index = IndexWriter(self.store, self.analyzer,\
175                                  IndexWriter.MaxFieldLength.LIMITED)
176         return self.index
177
178     def optimize(self):
179         self.index.optimize()
180
181     def close(self):
182         try:
183             self.index.optimize()
184         except JavaError as je:
185             print("Error during optimize phase, check index: %s" % je)
186
187         self.index.close()
188         self.index = None
189
190     def __enter__(self):
191         self.open()
192         return self
193
194     def __exit__(self, type, value, tb):
195         self.close()
196
197
198 class Index(BaseIndex):
199     """
200     Class indexing books.
201     """
202     def __init__(self, analyzer=None):
203         super(Index, self).__init__(analyzer)
204
205     def index_tags(self):
206         """
207         Re-index global tag list.
208         Removes all tags from the index, then indexes them again.
209         Indexed fields include: id, name (with and without polish stems), category
210         """
211         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
212         self.index.deleteDocuments(q)
213
214         for tag in catalogue.models.Tag.objects.all():
215             doc = Document()
216             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
217             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
218             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
220             self.index.addDocument(doc)
221
222     def create_book_doc(self, book):
223         """
224         Create a Lucene document referring to the book id.
225         """
226         doc = Document()
227         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
228         if book.parent is not None:
229             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
230         return doc
231
232     def remove_book(self, book):
233         """Removes a book from search index.
234         book - Book instance."""
235         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
236         self.index.deleteDocuments(q)
237
238     def index_book(self, book, book_info=None, overwrite=True):
239         """
240         Indexes the book.
241         Creates a lucene document for extracted metadata
242         and calls self.index_content() to index the contents of the book.
243         """
244         if overwrite:
245             self.remove_book(book)
246
247         book_doc = self.create_book_doc(book)
248         meta_fields = self.extract_metadata(book, book_info)
249         for f in meta_fields.values():
250             if isinstance(f, list) or isinstance(f, tuple):
251                 for elem in f:
252                     book_doc.add(elem)
253             else:
254                 book_doc.add(f)
255
256         self.index.addDocument(book_doc)
257         del book_doc
258
259         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])
260
261     master_tags = [
262         'opowiadanie',
263         'powiesc',
264         'dramat_wierszowany_l',
265         'dramat_wierszowany_lp',
266         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
267         'wywiad'
268         ]
269
270     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
271
272     def extract_metadata(self, book, book_info=None):
273         """
274         Extracts metadata from the book and returns a map of fields keyed by field name.
275         """
276         fields = {}
277
278         if book_info is None:
279             book_info = dcparser.parse(open(book.xml_file.path))
280
281         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
282         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
283         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
284
285         # validator, name
286         for field in dcparser.BookInfo.FIELDS:
287             if hasattr(book_info, field.name):
288                 if not getattr(book_info, field.name):
289                     continue
290                 # since no type information is available, we use validator
291                 type_indicator = field.validator
292                 if type_indicator == dcparser.as_unicode:
293                     s = getattr(book_info, field.name)
294                     if field.multiple:
295                         s = ', '.join(s)
296                     try:
297                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
298                     except JavaError as je:
299                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
300                 elif type_indicator == dcparser.as_person:
301                     p = getattr(book_info, field.name)
302                     if isinstance(p, dcparser.Person):
303                         persons = unicode(p)
304                     else:
305                         persons = ', '.join(map(unicode, p))
306                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
307                 elif type_indicator == dcparser.as_date:
308                     dt = getattr(book_info, field.name)
309                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
310                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
311
312         return fields
313
314     def add_gaps(self, fields, fieldname):
315         """
316         Interleaves a list of fields with gap fields (indexed spaces) and returns the result.
317         This prevents phrase queries (with slop 0) from matching across the gaps.
318         """
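        # Illustrative sketch: for input fields f1, f2, f3 this yields
        # f1, gap, f2, gap, f3, where each gap is a NOT_ANALYZED single-space
        # field with the same field name.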
319         def gap():
320             while True:
321                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
322         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
323
324     def get_master(self, root):
325         """
326         Returns the first master tag from an etree.
327         """
328         for master in root.iter():
329             if master.tag in self.master_tags:
330                 return master
331
332     def index_content(self, book, book_fields=[]):
333         """
334         Walks the book XML and extracts content from it.
335         Adds parts for each header tag and for each fragment.
336         """
337         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
338         root = wld.edoc.getroot()
339
340         master = self.get_master(root)
341         if master is None:
342             return []
343
344         def walker(node):
345             yield node, None
346             for child in list(node):
347                 for b, e in walker(child):
348                     yield b, e
349             yield None, node
350             return
351
352         def fix_format(text):
353             return re.sub("(?m)/$", "", text)
354
355         def add_part(snippets, **fields):
356             doc = self.create_book_doc(book)
357             for f in book_fields:
358                 doc.add(f)
359
360             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
361             doc.add(NumericField("header_span", Field.Store.YES, True)\
362                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
363             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
364
365             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
366                           Field.TermVector.WITH_POSITIONS_OFFSETS))
367
368             snip_pos = snippets.add(fields["content"])
369             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
370             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
371
372             if 'fragment_anchor' in fields:
373                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
374                               Field.Store.YES, Field.Index.NOT_ANALYZED))
375
376             if 'themes' in fields:
377                 themes, themes_pl = zip(*[
378                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
379                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
380                      for theme in fields['themes']])
381
382                 themes = self.add_gaps(themes, 'themes')
383                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
384
385                 for t in themes:
386                     doc.add(t)
387                 for t in themes_pl:
388                     doc.add(t)
389
390             return doc
391
392         def give_me_utf8(s):
393             if isinstance(s, unicode):
394                 return s.encode('utf-8')
395             else:
396                 return s
397
398         fragments = {}
399         snippets = Snippets(book.id).open('w')
400         try:
401             for header, position in zip(list(master), range(len(master))):
402
403                 if header.tag in self.skip_header_tags:
404                     continue
405                 if header.tag is etree.Comment:
406                     continue
407
408                 # section content
409                 content = []
410
411                 for start, end in walker(header):
412                     # handle fragments and themes.
413                     if start is not None and start.tag == 'begin':
414                         fid = start.attrib['id'][1:]
415                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
416
417                     elif start is not None and start.tag == 'motyw':
418                         fid = start.attrib['id'][1:]
419                         if start.text is not None:
420                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
421
422                     elif start is not None and start.tag == 'end':
423                         fid = start.attrib['id'][1:]
424                         if fid not in fragments:
425                             continue  # a broken <end> node, skip it
427                         frag = fragments[fid]
428                         if frag['themes'] == []:
429                             continue  # empty themes list.
430                         del fragments[fid]
431
432                         def jstr(l):
433                             return u' '.join(map(
434                                 lambda x: x == None and u'(none)' or unicode(x),
435                                 l))
436
437                         doc = add_part(snippets,
438                                        header_type=frag['start_header'],
439                                        header_index=frag['start_section'],
440                                        header_span=position - frag['start_section'] + 1,
441                                        fragment_anchor=fid,
442                                        content=u' '.join(filter(lambda s: s is not None, frag['content'])),
443                                        themes=frag['themes'])
444
445                         self.index.addDocument(doc)
446
447                     # Collect content.
448                     elif start is not None:
449                         for frag in fragments.values():
450                             frag['content'].append(start.text)
451                         content.append(start.text)
452                     elif end is not None:
453                         for frag in fragments.values():
454                             frag['content'].append(end.tail)
455                         content.append(end.tail)
456
457                 # in the end, add the section text.
458                 doc = add_part(snippets, header_index=position, header_type=header.tag,
459                                content=fix_format(u' '.join(filter(lambda s: s is not None, content))))
460
461                 self.index.addDocument(doc)
462
463         finally:
464             snippets.close()
465
466
467 def log_exception_wrapper(f):
468     def _wrap(*a):
469         try:
470             f(*a)
471         except Exception as e:
472             print("Error in indexing thread: %s" % e)
473             traceback.print_exc()
474             raise e
475     return _wrap
476
477
478 class ReusableIndex(Index):
479     """
480     Works like Index, but does not close/optimize the Lucene index
481     until program exit (uses an atexit hook).
482     This is useful for the importbooks command.
483
484     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
485     """
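    # Usage sketch (assuming atexit is reliable in the calling context;
    # `some_book` stands for a hypothetical Book instance):
    #
    #   index = ReusableIndex()
    #   index.open()
    #   index.index_book(some_book)
    #   # ... index more books; the underlying writer stays open ...
    #   # ReusableIndex.close_reusable() runs automatically at program exit.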
486     index = None
487
488     def open(self, analyzer=None, threads=4):
489         if ReusableIndex.index is not None:
490             self.index = ReusableIndex.index
491         else:
492             print("opening index")
493             Index.open(self, analyzer)
494             ReusableIndex.index = self.index
495             atexit.register(ReusableIndex.close_reusable)
496
497     # def index_book(self, *args, **kw):
498     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
499     #     ReusableIndex.pool_jobs.append(job)
500
501     @staticmethod
502     def close_reusable():
503         if ReusableIndex.index is not None:
504             ReusableIndex.index.optimize()
505             ReusableIndex.index.close()
506             ReusableIndex.index = None
507
508     def close(self):
509         pass
510
511
512 class JoinSearch(object):
513     """
514     This mixin could be used to handle block join queries.
515     (currently unused)
516     """
517     def __init__(self, *args, **kw):
518         super(JoinSearch, self).__init__(*args, **kw)
519
520     def wrapjoins(self, query, fields=[]):
521         """
522         This function modifies the query recursively,
523         so that contained Term and Phrase queries which match
524         the provided fields are wrapped in a BlockJoinQuery
525         and thus delegated to child documents.
526         """
527         if BooleanQuery.instance_(query):
528             qs = BooleanQuery.cast_(query)
529             for clause in qs:
530                 clause = BooleanClause.cast_(clause)
531                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
532             return qs
533         else:
534             termset = HashSet()
535             query.extractTerms(termset)
536             for t in termset:
537                 t = Term.cast_(t)
538                 if t.field() not in fields:
539                     return query
540             return BlockJoinQuery(query, self.parent_filter,
541                                   BlockJoinQuery.ScoreMode.Total)
542
543     def bsearch(self, query, max_results=50):
544         q = self.query(query)
545         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
546
547         tops = self.searcher.search(bjq, max_results)
548         bks = []
549         for found in tops.scoreDocs:
550             doc = self.searcher.doc(found.doc)
551             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
552         return (bks, tops.totalHits)
553
554
555 class SearchResult(object):
556     def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
557         if tokens_cache is None: tokens_cache = {}
558             
559         if score:
560             self._score = score
561         else:
562             self._score = scoreDocs.score
563             
564         self.boost = 1.0
565
566         self._hits = []
567         self.hits = None  # processed hits
568
569         stored = searcher.doc(scoreDocs.doc)
570         self.book_id = int(stored.get("book_id"))
571
572         header_type = stored.get("header_type")
573         if not header_type:
574             return
575
576         sec = (header_type, int(stored.get("header_index")))
577         header_span = stored.get('header_span')
578         header_span = header_span is not None and int(header_span) or 1
579
580         fragment = stored.get("fragment_anchor")
581
582         if snippets:
583             snippets = snippets.replace("/\n", "\n")
584         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
585
586         self._hits.append(hit)
587
588         self.searcher = searcher
589         self.searched = searched
590         self.tokens_cache = tokens_cache
591
592     @property
593     def score(self):
594         return self._score * self.boost
595
596     def merge(self, other):
597         if self.book_id != other.book_id:
598             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
599         self._hits += other._hits
600         if other.score > self.score:
601             self._score = other._score
602         return self
603
604     def get_book(self):
605         return catalogue.models.Book.objects.get(id=self.book_id)
606
607     book = property(get_book)
608
609     def process_hits(self):
610         POSITION = 0
611         FRAGMENT = 1
612         POSITION_INDEX = 1
613         POSITION_SPAN = 2
614         SCORE = 2
615         OTHER = 3
616
617         # to sections and fragments
618         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
619         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
620         sect = filter(lambda s: 0 == len(filter(
621             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
622             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
623             frags)), sect)
624
625         hits = []
626
627         # remove duplicate fragments
628         fragments = {}
629         for f in frags:
630             fid = f[FRAGMENT]
631             if fid in fragments:
632                 if fragments[fid][SCORE] >= f[SCORE]:
633                     continue
634             fragments[fid] = f
635         frags = fragments.values()
636
637         # remove duplicate sections
638         sections = {}
639
640         for s in sect:
641             si = s[POSITION][POSITION_INDEX]
642             # skip existing
643             if si in sections:
644                 if sections[si]['score'] >= s[SCORE]:
645                     continue
646
647             m = {'score': s[SCORE],
648                  'section_number': s[POSITION][POSITION_INDEX] + 1,
649                  }
650             m.update(s[OTHER])
651             sections[si] = m
652
653         hits = sections.values()
654
655         for f in frags:
656             frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
657
658             # Figure out if we were searching for a token matching some word in theme name.
659             themes = frag.tags.filter(category='theme')
660             themes_hit = []
661             if self.searched is not None:
662                 tokens = self.searcher.get_tokens(self.searched, 'POLISH', tokens_cache=self.tokens_cache)
663                 for theme in themes:
664                     name_tokens = self.searcher.get_tokens(theme.name, 'POLISH')
665                     for t in tokens:
666                         if t in name_tokens:
667                             if theme not in themes_hit:
668                                 themes_hit.append(theme)
669                             break
670
671             m = {'score': f[SCORE],
672                  'fragment': frag,
673                  'section_number': f[POSITION][POSITION_INDEX] + 1,
674                  'themes': themes,
675                  'themes_hit': themes_hit
676                  }
677             m.update(f[OTHER])
678             hits.append(m)
679
680         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
681
682         self.hits = hits
683
684         return self
685
686     def __unicode__(self):
687         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
688
689     @staticmethod
690     def aggregate(*result_lists):
691         books = {}
692         for rl in result_lists:
693             for r in rl:
694                 if r.book_id in books:
695                     books[r.book_id].merge(r)
697                 else:
698                     books[r.book_id] = r
699         return books.values()
700
701     def __cmp__(self, other):
702         return cmp(self.score, other.score)
703
704
705 class Hint(object):
706     """
707     Given some hint information (things we already know about
708     our search target - like author, title (a specific book), epoch, genre, kind),
709     we can narrow down the search using filters.
710     """
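    # Usage sketch (illustrative; `author_tags` stands for some catalogue Tag objects
    # and `search` for a Search instance):
    #
    #   hint = search.hint()
    #   hint.tags(author_tags)                  # narrow results by author tags
    #   results = search.search_perfect_book(u"pan tadeusz", hint=hint)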
711     def __init__(self, search):
712         """
713         Accepts a Searcher instance.
714         """
715         self.search = search
716         self.book_tags = {}
717         self.part_tags = []
718         self._books = []
719
720     def books(self, *books):
721         """
722         Give a hint that we search these books.
723         """
724         self._books = books
725
726     def tags(self, tags):
727         """
728         Give a hint that these Tag objects (given as a list)
729         are required.
730         """
731         for t in tags:
732             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
733                 lst = self.book_tags.get(t.category, [])
734                 lst.append(t)
735                 self.book_tags[t.category] = lst
736             if t.category in ['theme', 'theme_pl']:
737                 self.part_tags.append(t)
738
739     def tag_filter(self, tags, field='tags'):
740         """
741         Given a list of tags and an optional field (they normally live in the 'tags' field),
742         returns a filter accepting only books with the specified tags.
743         """
744         q = BooleanQuery()
745
746         for tag in tags:
747             toks = self.search.get_tokens(tag.name, field=field)
748             tag_phrase = PhraseQuery()
749             for tok in toks:
750                 tag_phrase.add(Term(field, tok))
751             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
752
753         return QueryWrapperFilter(q)
754
755     def book_filter(self):
756         """
757         Filters using book tags (all tag categories except theme).
758         """
759         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
760         if tags:
761             return self.tag_filter(tags)
762         else:
763             return None
764
765     def part_filter(self):
766         """
767         This filter can be used to look for book parts.
768         It filters on book id and/or themes.
769         """
770         fs = []
771         if self.part_tags:
772             fs.append(self.tag_filter(self.part_tags, field='themes'))
773
774         if self._books != []:
775             bf = BooleanFilter()
776             for b in self._books:
777                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
778                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
779             fs.append(bf)
780
781         return Search.chain_filters(fs)
782
783     def should_search_for_book(self):
784         return self._books == []
785
786     def just_search_in(self, all):
787         """Holds logic to figure out which indexes should be searched, when we already have some hints."""
788         some = []
789         for field in all:
790             if field == 'authors' and 'author' in self.book_tags:
791                 continue
792             if field == 'title' and self._books != []:
793                 continue
794             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
795                 continue
796             some.append(field)
797         return some
798
799
800 class Search(IndexStore):
801     """
802     Search facilities.
803     """
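    # Usage sketch:
    #
    #   search = Search()
    #   books, total = search.simple_search(u"mickiewicz")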
804     def __init__(self, default_field="content"):
805         IndexStore.__init__(self)
806         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
807         # self.analyzer = WLAnalyzer()
808         self.searcher = IndexSearcher(self.store, True)
809         self.parser = QueryParser(Version.LUCENE_34, default_field,
810                                   self.analyzer)
811
812         self.parent_filter = TermsFilter()
813         self.parent_filter.addTerm(Term("is_book", "true"))
814
815     def query(self, query):
816         """Parse a query written in default Lucene syntax (intended for humans).
817         """
818         return self.parser.parse(query)
819
820     def simple_search(self, query, max_results=50):
821         """Runs a query for books using Lucene syntax (intended for humans).
822         Returns (books, total_hits)
823         """
824
825         tops = self.searcher.search(self.query(query), max_results)
826         bks = []
827         for found in tops.scoreDocs:
828             doc = self.searcher.doc(found.doc)
829             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
830         return (bks, tops.totalHits)
831
832     def get_tokens(self, searched, field='content', cached=None):
833         """Returns tokens analyzed by the proper analyzer for the given field.
834         The argument can be a StringReader, a string/unicode, or a list of tokens. In the last case
835         they are just returned (so we can reuse tokens, as long as we do not change the analyzer).
836         """
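        # Sketch: get_tokens(u"Ala ma kota", field='SIMPLE') should yield
        # something like [u'ala', u'ma', u'kota'] (exact output depends on the analyzer).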
837         if cached is not None and field in cached:
838             return cached[field]
839
840         if isinstance(searched, str) or isinstance(searched, unicode):
841             searched = StringReader(searched)
842         elif isinstance(searched, list):
843             return searched
844
845         searched.reset()
846         tokens = self.analyzer.reusableTokenStream(field, searched)
847         toks = []
848         while tokens.incrementToken():
849             cta = tokens.getAttribute(CharTermAttribute.class_)
850             toks.append(cta.toString())
851
852         if cached is not None:
853             cached[field] = toks
854
855         return toks
856
857     def fuzziness(self, fuzzy):
858         """Helper method to sanitize fuzziness"""
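        # Sketch: fuzziness(None) -> None, fuzziness(0.8) -> 0.8, fuzziness(True) -> 0.5.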
859         if not fuzzy:
860             return None
861         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
862             return fuzzy
863         else:
864             return 0.5
865
866     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
867         """
868         Return a PhraseQuery with a series of tokens.
869         """
870         if fuzzy:
871             phrase = MultiPhraseQuery()
872             for t in tokens:
873                 term = Term(field, t)
874                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
875                 fuzzterms = []
876
877                 while True:
878                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
879                     ft = fuzzterm.term()
880                     if ft:
881                         fuzzterms.append(ft)
882                     if not fuzzterm.next(): break
883                 if fuzzterms:
884                     phrase.add(JArray('object')(fuzzterms, Term))
885                 else:
886                     phrase.add(term)
887         else:
888             phrase = PhraseQuery()
889             phrase.setSlop(slop)
890             for t in tokens:
891                 term = Term(field, t)
892                 phrase.add(term)
893         return phrase
894
895     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
896         """
897         Returns term queries joined by a boolean query.
898         modal - applies to the boolean query
899         fuzzy - whether the query should be fuzzy.
900         """
901         q = BooleanQuery()
902         for t in tokens:
903             term = Term(field, t)
904             if fuzzy:
905                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
906             else:
907                 term = TermQuery(term)
908             q.add(BooleanClause(term, modal))
909         return q
910
911     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
912                       filters=None, tokens_cache=None, boost=None):
913         if filters is None: filters = []
914         if tokens_cache is None: tokens_cache = {}
915
916         tokens = self.get_tokens(searched, field, cached=tokens_cache)
917
918         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy)
919         if book:
920             filters.append(self.term_filter(Term('is_book', 'true')))
921         top = self.searcher.search(query, self.chain_filters(filters), max_results)
922
923         return [SearchResult(self.searcher, found) for found in top.scoreDocs]
924
925     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
926                     filters=None, tokens_cache=None, boost=None):
927         if filters is None: filters = []
928         if tokens_cache is None: tokens_cache = {}
929
930         if book:
931             filters.append(self.term_filter(Term('is_book', 'true')))
932
933         query = BooleanQuery()
934
935         for fld in fields:
936             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
937
938             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
939                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
940
941         top = self.searcher.search(query, self.chain_filters(filters), max_results)
942
943         return [SearchResult(self.searcher, found, searched=searched, tokens_cache=tokens_cache) for found in top.scoreDocs]
944
945     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
946         """
947         Search for perfect book matches. Just see if the query matches some author or title,
948         taking hints into account.
949         """
950         fields_to_search = ['authors', 'title']
951         only_in = None
952         if hint:
953             if not hint.should_search_for_book():
954                 return []
955             fields_to_search = hint.just_search_in(fields_to_search)
956             only_in = hint.book_filter()
957
958         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
959
960         books = []
961         for q in qrys:
962             top = self.searcher.search(q,
963                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
964                 max_results)
965             for found in top.scoreDocs:
966                 books.append(SearchResult(self.searcher, found, how_found="search_perfect_book"))
967         return books
968
969     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
970         fields_to_search = ['tags', 'authors', 'title']
971
972         only_in = None
973         if hint:
974             if not hint.should_search_for_book():
975                 return []
976             fields_to_search = hint.just_search_in(fields_to_search)
977             only_in = hint.book_filter()
978
979         tokens = self.get_tokens(searched, field='SIMPLE')
980
981         q = BooleanQuery()
982
983         for fld in fields_to_search:
984             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
985                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
986
987         books = []
988         top = self.searcher.search(q,
989                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
990             max_results)
991         for found in top.scoreDocs:
992             books.append(SearchResult(self.searcher, found, how_found="search_book"))
993
994         return books
995
996     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
997         """
998         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
999         some part/fragment of the book.
1000         """
1001         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1002
1003         flt = None
1004         if hint:
1005             flt = hint.part_filter()
1006
1007         books = []
1008         for q in qrys:
1009             top = self.searcher.search(q,
1010                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1011                                                            flt]),
1012                                        max_results)
1013             for found in top.scoreDocs:
1014                 books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1015
1016         return books
1017
1018     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1019         """
1020         Tries to use search terms to match different fields of a book (or its parts).
1021         E.g. one word can be an author's surname, another a part of the title, and the rest
1022         some words from the third chapter.
1023         """
1024         if tokens_cache is None: tokens_cache = {}
1025         books = []
1026         only_in = None
1027
1028         if hint:
1029             only_in = hint.part_filter()
1030
1031         # content only query : themes x content
1032         q = BooleanQuery()
1033
1034         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1035         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1036
1037         # only search in themes when we do not already filter by themes
1038         if hint is None or hint.just_search_in(['themes']) != []:
1039             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1040                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1041
1042         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1043                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1044
1045         topDocs = self.searcher.search(q, only_in, max_results)
1046         for found in topDocs.scoreDocs:
1047             books.append(SearchResult(self.searcher, found, how_found='search_everywhere_themesXcontent'))
1048             print("* %s theme x content: %s" % (searched, books[-1]._hits))
1049
1050         # query themes/content x author/title/tags
1051         q = BooleanQuery()
1052         in_content = BooleanQuery()
1053         in_meta = BooleanQuery()
1054
1055         for fld in ['themes_pl', 'content']:
1056             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1057
1058         for fld in ['tags', 'authors', 'title']:
1059             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1060
1061         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1062         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1063
1064         topDocs = self.searcher.search(q, only_in, max_results)
1065         for found in topDocs.scoreDocs:
1066             books.append(SearchResult(self.searcher, found, how_found='search_everywhere'))
1067             print("* %s scatter search: %s" % (searched, books[-1]._hits))
1068
1069         return books
1070
1071     # def multisearch(self, query, max_results=50):
1072     #     """
1073     #     Search strategy:
1074     #     - (phrase) OR -> content
1075     #                   -> title
1076     #                   -> authors
1077     #     - (keywords)  -> authors
1078     #                   -> motyw
1079     #                   -> tags
1080     #                   -> content
1081     #     """
1082         # queryreader = StringReader(query)
1083         # tokens = self.get_tokens(queryreader)
1084
1085         # top_level = BooleanQuery()
1086         # Should = BooleanClause.Occur.SHOULD
1087
1088         # phrase_level = BooleanQuery()
1089         # phrase_level.setBoost(1.3)
1090
1091         # p_content = self.make_phrase(tokens, joined=True)
1092         # p_title = self.make_phrase(tokens, 'title')
1093         # p_author = self.make_phrase(tokens, 'author')
1094
1095         # phrase_level.add(BooleanClause(p_content, Should))
1096         # phrase_level.add(BooleanClause(p_title, Should))
1097         # phrase_level.add(BooleanClause(p_author, Should))
1098
1099         # kw_level = BooleanQuery()
1100
1101         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1102         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1103         # kw_level.add(j_themes, Should)
1104         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1105         # j_con = self.make_term_query(tokens, joined=True)
1106         # kw_level.add(j_con, Should)
1107
1108         # top_level.add(BooleanClause(phrase_level, Should))
1109         # top_level.add(BooleanClause(kw_level, Should))
1110
1111         # return None
1112
1113     def get_snippets(self, scoreDoc, query, field='content'):
1114         """
1115         Returns a snippet for found scoreDoc.
1116         """
1117         htmlFormatter = SimpleHTMLFormatter()
1118         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1119
1120         stored = self.searcher.doc(scoreDoc.doc)
1121
1122         # locate content.
1123         snippets = Snippets(stored.get('book_id')).open()
1124         try:
1125             text = snippets.get((int(stored.get('snippets_position')),
1126                                  int(stored.get('snippets_length'))))
1127         finally:
1128             snippets.close()
1129
1130         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1131         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1132         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1133
1134         return snip
1135
1136     @staticmethod
1137     def enum_to_array(enum):
1138         """
1139         Converts a Lucene TermEnum to an array of Terms, suitable for
1140         addition to queries.
1141         """
1142         terms = []
1143
1144         while True:
1145             t = enum.term()
1146             if t:
1147                 terms.append(t)
1148             if not enum.next(): break
1149
1150         if terms:
1151             return JArray('object')(terms, Term)
1152
1153     def search_tags(self, query, filter=None, max_results=40):
1154         """
1155         Search for Tag objects using query.
1156         """
1157         tops = self.searcher.search(query, filter, max_results)
1158
1159         tags = []
1160         for found in tops.scoreDocs:
1161             doc = self.searcher.doc(found.doc)
1162             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1163             tags.append(tag)
1164             print("%s (%d) -> %f" % (tag, tag.id, found.score))
1165
1166         return tags
1167
1168     def search_books(self, query, filter=None, max_results=10):
1169         """
1170         Searches for Book objects using query
1171         """
1172         bks = []
1173         tops = self.searcher.search(query, filter, max_results)
1174         for found in tops.scoreDocs:
1175             doc = self.searcher.doc(found.doc)
1176             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1177         return bks
1178
1179     def create_prefix_phrase(self, toks, field):
1180         q = MultiPhraseQuery()
1181         for i in range(len(toks)):
1182             t = Term(field, toks[i])
1183             if i == len(toks) - 1:
1184                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1185                 if pterms:
1186                     q.add(pterms)
1187                 else:
1188                     q.add(t)
1189             else:
1190                 q.add(t)
1191         return q
1192
1193     @staticmethod
1194     def term_filter(term, inverse=False):
1195         only_term = TermsFilter()
1196         only_term.addTerm(term)
1197
1198         if inverse:
1199             neg = BooleanFilter()
1200             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1201             only_term = neg
1202
1203         return only_term
1204
1205     def hint_tags(self, string, max_results=50):
1206         """
1207         Return auto-complete hints for tags
1208         using prefix search.
1209         """
1210         toks = self.get_tokens(string, field='SIMPLE')
1211         top = BooleanQuery()
1212
1213         for field in ['tag_name', 'tag_name_pl']:
1214             q = self.create_prefix_phrase(toks, field)
1215             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1216
1217         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1218
1219         return self.search_tags(top, no_book_cat, max_results=max_results)
1220
1221     def hint_books(self, string, max_results=50):
1222         """
1223         Returns auto-complete hints for book titles
1224         (since we do not index 'pseudo' title-tags),
1225         using prefix search.
1226         """
1227         toks = self.get_tokens(string, field='SIMPLE')
1228
1229         q = self.create_prefix_phrase(toks, 'title')
1230
1231         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1232
1233     @staticmethod
1234     def chain_filters(filters, op=ChainedFilter.AND):
1235         """
1236         Chains a filter list together
1237         """
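        # Sketch: None entries are skipped, so chain_filters([only_in, None]) wraps
        # just the non-None filter(s); an empty (or all-None) list yields None.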
1238         filters = filter(lambda x: x is not None, filters)
1239         if not filters:
1240             return None
1241         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1242         return chf
1243
1244     def filtered_categories(self, tags):
1245         """
1246         Return a list of tag categories present in the given tags list.
1247         """
1248         cats = {}
1249         for t in tags:
1250             cats[t.category] = True
1251         return cats.keys()
1252
1253     def hint(self):
1254         return Hint(self)