# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM.
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    """
    Provides access to the search index.

    self.store - the Lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        return checker.checkIndex()


class Snippets(object):
    """
    Manages snippet files for an indexed object (book).
    The snippets are concatenated together; their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet
        stored there as unicode.
        """
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        """Close the snippet file."""
        self.file.close()


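# A minimal usage sketch for Snippets (the book id 1 below is purely
# illustrative; any indexed book id works the same way):
#
#     snips = Snippets(1).open('w')
#     try:
#         pos = snips.add(u"Some fragment text")  # -> (position, length)
#     finally:
#         snips.close()
#     # later, e.g. when highlighting search results:
#     print Snippets(1).open().get(pos)

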
class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on the index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already open")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()


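# BaseIndex supports the context-manager protocol, so subclasses can be
# used like this (a sketch; index_tags() is defined on Index below):
#
#     with Index() as index:
#         index.index_tags()
#
# __exit__ closes (and optimizes) the index even if indexing raises.

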
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def remove_book(self, book):
        """Remove a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Index the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interleave a list of fields with gap fields, which are indexed spaces,
        and return the result.
        This allows phrase queries that do not cross the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

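    # For example (a sketch): given three tag fields [t1, t2, t3],
    # add_gaps() yields [t1, gap, t2, gap, t3], where each gap is an
    # indexed space in the same field. With slop=0 a phrase query can
    # then match inside one tag value, but never across two of them.
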
    def get_master(self, root):
        """
        Return the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walk the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span', 1)))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue

                content = u' '.join([t for t in header.itertext()])
                content = fix_format(content)

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)

                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: u'(none)' if x is None else unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()


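# A typical indexing run might look like this (a sketch; Book is the
# catalogue model imported above):
#
#     index = Index()
#     index.open()
#     try:
#         index.index_tags()
#         for book in catalogue.models.Book.objects.all():
#             index.index_book(book)
#     finally:
#         index.close()  # optimizes and closes the underlying IndexWriter

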
def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass


class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        Modify the query recursively, so that contained Term and Phrase
        queries which match the provided fields are wrapped in a
        BlockJoinQuery and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self._hits = []
        self.hits = None  # processed hits

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = int(header_span) if header_span is not None else 1

        fragment = stored.get("fragment_anchor")

        if snippets:
            snippets = snippets.replace("/\n", "\n")
        hit = (sec + (header_span,), fragment, scoreDocs.score,
               {'how_found': how_found, 'snippets': snippets and [snippets] or []})

        self._hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with a result for book %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # split hits into sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        # drop sections already covered by some fragment's span
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        hits = []

        # remove duplicate fragments, keeping the best score for each anchor
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections, keeping the best score for each index
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self.hits = hits

        return self

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)


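# A sketch of how SearchResult objects are combined (assumes an indexed
# corpus and search = Search(); the query string is illustrative):
#
#     perfect = search.search_perfect_book(u"Pan Tadeusz")
#     parts = search.search_perfect_parts(u"Pan Tadeusz")
#     results = SearchResult.aggregate(perfect, parts)  # one result per book
#     for r in sorted(results, reverse=True):           # uses __cmp__ on score
#         print r.process_hits().hits

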
class Hint(object):
    """
    Given some hint information (things we already know about the search
    target, like author, title (a specific book), epoch, genre or kind),
    we can narrow the search down using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we are searching within these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list) are required.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (tags normally live in the
        'tags' field), return a filter accepting only books with those tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filter using book tags (all tag categories except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds the logic to figure out which fields should be searched,
        given the hints we already have."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some


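# A sketch of narrowing a search with hints (author_tag stands for any Tag
# with category 'author'; the variable names are illustrative):
#
#     search = Search()
#     hint = search.hint()
#     hint.tags([author_tag])
#     books = search.search_perfect_book(u"some title", hint=hint)
#
# With an author hint in place, just_search_in() drops the 'authors'
# field and book_filter() restricts matches to that author's books.

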
class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in the default Lucene syntax (for humans).
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Run a query for books using Lucene syntax (for humans).
        Returns (books, total_hits).
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def get_tokens(self, searched, field='content'):
        """Return tokens analyzed by the proper analyzer for the given field.
        The argument can be a StringReader, a string/unicode, or a token list.
        In the last case the tokens are just returned (so we can reuse them,
        as long as we do not change the analyzer).
        """
        if isinstance(searched, (str, unicode)):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

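    # For example (a sketch; the exact output depends on the Polish stemmer):
    #
    #     search = Search()
    #     search.get_tokens(u"Pana Tadeusza", field='content')  # stemmed forms
    #     search.get_tokens(u"Pana Tadeusza", field='SIMPLE')   # lowercased words only
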
    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness parameter."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

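    # For example (a sketch): a slop of 2 lets the phrase tolerate a couple
    # of position shifts between tokens, while fuzzy=0.7 expands each token
    # to similar terms taken from the index:
    #
    #     toks = self.get_tokens(u"lipa czarnoleska", field='content')
    #     q = self.make_phrase(toks, field='content')   # near-exact phrase
    #     q_fuzzy = self.make_phrase(toks, fuzzy=0.7)   # typo-tolerant variant
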
    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Return term queries joined into a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    # def content_query(self, query):
    #     return BlockJoinQuery(query, self.parent_filter,
    #                           BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches
        some author or title, taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching
        (with a slop of 2, the default for make_phrase()) some part/fragment
        of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Try to use the search terms to match different fields of the book (or its parts).
        E.g. one word can match the author, another can be part of the title,
        and the rest some words from the third chapter.
        """
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content')
        tokens = self.get_tokens(searched, field='SIMPLE')

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found='search_everywhere_themesXcontent'))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found='search_everywhere'))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        # return None

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Return a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate the content in the book's snippet file
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Convert a Lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Search for Tag objects using query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print "%s (%d) -> %f" % (tag, tag.id, found.score)

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Search for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

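    # For example (a sketch): keep only book-level documents, or exclude them:
    #
    #     only_books = Search.term_filter(Term("is_book", "true"))
    #     only_parts = Search.term_filter(Term("is_book", "true"), inverse=True)
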
    def hint_tags(self, string, max_results=50):
        """
        Return auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Return auto-complete hints for book titles,
        using prefix search (we do not index 'pseudo' title-tags).
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

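    # For example (a sketch; results depend on what is indexed):
    #
    #     search = Search()
    #     search.hint_tags(u"mick")    # -> tags starting with "mick"
    #     search.hint_books(u"pan t")  # -> books whose title starts with "pan t"
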
    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chain a filter list together, skipping None entries.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

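    # For example (a sketch): combine an optional hint filter with the
    # is_book filter; None entries (e.g. when there is no hint) are dropped:
    #
    #     flt = Search.chain_filters([only_in, Search.term_filter(Term('is_book', 'true'))])
    #     top = self.searcher.search(q, flt, max_results)
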
    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)