cc478ee533934ba0c016683f6eb07d0a19cfc6d4
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray, JavaError
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 import catalogue.models
29 from multiprocessing.pool import ThreadPool
30 from threading import current_thread
31 import atexit
32 import traceback
33
34
class WLAnalyzer(PerFieldAnalyzerWrapper):
    """
    Per-field analyzer used for the search index.

    The wrapper is initialised with the Polish analyzer; the fields listed
    below are explicitly bound to the simple, keyword or Polish analyzer.
    """
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        # not sure if needed: there's NOT_ANALYZED meaning basically the same
        keyword = KeywordAnalyzer(Version.LUCENE_34)

        PerFieldAnalyzerWrapper.__init__(self, polish)

        # (field name, analyzer) pairs, registered in exactly this order.
        # shouldn't the title have two forms? _pl and simple?
        bindings = (
            ("tags", simple),
            ("technical_editors", simple),
            ("editors", simple),
            ("url", keyword),
            ("source_url", keyword),
            ("source_name", simple),
            ("publisher", simple),
            ("authors", simple),
            ("title", simple),
            ("is_book", keyword),
            ("themes", simple),
            ("themes_pl", polish),
            ("tag_name", simple),
            ("tag_name_pl", polish),
            ("translators", simple),
            ("KEYWORD", keyword),
            ("SIMPLE", simple),
            ("POLISH", polish),
        )
        for field_name, field_analyzer in bindings:
            self.addAnalyzer(field_name, field_analyzer)
73
74
class IndexStore(object):
    """
    Gives access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        index_location = File(settings.SEARCH_INDEX)
        self.store = SimpleFSDirectory(index_location)

    def make_index_dir(self):
        """
        Create the directory named by settings.SEARCH_INDEX.
        An already existing directory is not an error; any other
        OSError is propagated.
        """
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
92
93
class IndexChecker(IndexStore):
    """
    Runs Lucene's CheckIndex over the index store.
    """
    def __init__(self):
        super(IndexChecker, self).__init__()

    def check(self):
        """Return the status object produced by CheckIndex.checkIndex()."""
        return CheckIndex(self.store).checkIndex()
102
103
class Snippets(object):
    """
    This class manages snippet files for indexed object (book)
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.

    Can be used as a context manager on an opened instance:
    ``with Snippets(book_id).open('w') as snippets: ...``
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        """
        Make sure the snippet directory exists and remember the book id.
        The snippet file itself is not opened until .open() is called.
        """
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            # An already existing directory is fine, anything else is not.
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.file = None
        # Byte offset at which the next added snippet will start.
        self.position = 0

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        The file is always opened in binary mode; returns self.
        """
        if 'b' not in mode:
            mode += 'b'
        path = os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id))
        self.file = open(path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple, both counted in UTF-8 bytes.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        """Close snippet file. Does nothing if the file was never opened."""
        if self.file is not None:
            self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()
156
157
class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on index: opening, closing, optimizing.

    Usable as a context manager: entering opens the index writer,
    leaving optimizes and closes it.
    """
    def __init__(self, analyzer=None):
        """
        analyzer - analyzer used by the index writer; a WLAnalyzer is
        created when none is given.
        """
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        """
        Open an IndexWriter on the store and return it.

        analyzer - optional analyzer for this writer; falls back to the one
        given to the constructor (the argument used to be silently ignored).
        Raises Exception when the index is already opened.
        """
        if self.index:
            raise Exception("Index is already opened")
        if analyzer is None:
            analyzer = self.analyzer
        self.index = IndexWriter(self.store, analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        """Optimize the opened index."""
        self.index.optimize()

    def close(self):
        """
        Optimize and close the index writer.
        A failure during optimize is reported and the writer is closed
        anyway. Does nothing if the index is not opened.
        """
        if self.index is None:
            return
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()
195
196
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring book id
        (and the parent book id, when the book has a parent).
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def remove_book(self, book):
        """Removes a book from search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.

        overwrite - when true, documents already indexed for this book
        are removed first.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            # A metadata entry is either a single Field or a sequence of them.
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname.

        book_info - already parsed metadata; when None it is parsed
        from the book's XML file.
        """
        fields = {}

        if book_info is None:
            # Close the XML file as soon as the metadata has been parsed.
            with open(book.xml_file.path) as xml_file:
                book_info = dcparser.parse(xml_file)

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            value = getattr(book_info, field.name, None)
            if not value:
                # missing or empty metadata entry
                continue
            # since no type information is available, we use validator
            type_indicator = field.validator
            if type_indicator == dcparser.as_unicode:
                s = value
                if field.multiple:
                    s = ', '.join(s)
                try:
                    fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                except JavaError as je:
                    raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
            elif type_indicator == dcparser.as_person:
                if isinstance(value, dcparser.Person):
                    persons = unicode(value)
                else:
                    persons = ', '.join(map(unicode, value))
                fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
            elif type_indicator == dcparser.as_date:
                fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                           (value.year, value.month, value.day),
                                           Field.Store.NO, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
        This allows for doing phrase queries which do not overlap the gaps (when slop is 0).

        Returns a tuple; an empty input gives an empty tuple
        (it used to raise TypeError, e.g. for a book without tags).
        """
        with_gaps = []
        for field in fields:
            if with_gaps:
                # separate consecutive values with an indexed single space
                with_gaps.append(Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED))
            with_gaps.append(field)
        return tuple(with_gaps)

    def get_master(self, root):
        """
        Returns the first master tag from an etree
        (None when the tree contains no master tag).
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=None):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.

        book_fields - fields copied into every part document.
        Returns an empty list when the book has no master tag, None otherwise.
        """
        if book_fields is None:
            book_fields = []

        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            # Depth-first walk: (node, None) when entering a node,
            # (None, node) when leaving it.
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node

        def fix_format(text):
            # strip the slash found at line ends
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            # Build one part document (a header or a themed fragment) and
            # store its content in the snippet file.
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span') or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes = [Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED,
                                Field.TermVector.WITH_POSITIONS)
                          for theme in fields['themes']]
                themes_pl = [Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED,
                                   Field.TermVector.WITH_POSITIONS)
                             for theme in fields['themes']]

                for t in self.add_gaps(themes, 'themes'):
                    doc.add(t)
                for t in self.add_gaps(themes_pl, 'themes_pl'):
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        # Fragments that are currently open, keyed by fragment id.
        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue

                content = fix_format(u' '.join(header.itertext()))

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)

                # NOTE(review): for <begin>/<motyw> nodes the tail is appended
                # here on the entering event and again on the leaving event
                # (last branch), so it ends up twice in the fragment content
                # - confirm whether that is intended.
                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <motyw> node (no <begin>), skip it
                        if start.text is not None:
                            fragments[fid]['themes'] += [give_me_utf8(name).strip()
                                                         for name in start.text.split(',')]
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(s for s in frag['content'] if s is not None),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        # entering any other node: its text belongs to every open fragment
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        # leaving a node: its tail belongs to every open fragment
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()
453
454
def log_exception_wrapper(f):
    """
    Wrap f so that any Exception it raises is printed (message and
    traceback) before being re-raised.

    The wrapper passes positional and keyword arguments through and
    returns whatever f returns.
    """
    def _wrap(*a, **kw):
        try:
            return f(*a, **kw)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            # bare raise keeps the original traceback
            raise
    return _wrap
464
465
class ReusableIndex(Index):
    """
    Works like index, but does not close/optimize Lucene index
    until program exit (uses atexit hook).
    This is useful for importbooks command.

    if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
    """
    # The writer shared by all instances; None while no writer is open.
    index = None

    def open(self, analyzer=None, threads=4):
        """
        Attach this instance to the shared writer, opening it first if needed.
        Returns the writer.

        threads - not used by this method.
        """
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            # Drop a stale reference to a writer closed by close_reusable(),
            # otherwise the base class refuses to open a new one.
            self.index = None
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)
        return self.index

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        """
        Optimize and close the shared writer, if there is one.
        The writer is closed even when optimize fails.
        """
        index = ReusableIndex.index
        if index is None:
            return
        try:
            index.optimize()
        finally:
            index.close()
            ReusableIndex.index = None

    def close(self):
        """Does nothing: the shared writer is closed by close_reusable()."""
        pass
498
499
class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        Recursively rewrites the query: boolean queries are descended
        into clause by clause; any other query whose terms all belong to
        the given fields is wrapped in a BlockJoinQuery (and so delegated
        to children documents), otherwise it is returned unchanged.
        """
        if not BooleanQuery.instance_(query):
            terms = HashSet()
            query.extractTerms(terms)
            for raw_term in terms:
                if Term.cast_(raw_term).field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

        boolean_query = BooleanQuery.cast_(query)
        for raw_clause in boolean_query:
            clause = BooleanClause.cast_(raw_clause)
            clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
        return boolean_query

    def bsearch(self, query, max_results=50):
        """
        Parse the query, run it as a block join query and
        return (books, total_hits).
        """
        parsed = self.query(query)
        join_query = BlockJoinQuery(parsed, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(join_query, max_results)
        books = []
        for found in tops.scoreDocs:
            stored = self.searcher.doc(found.doc)
            book = catalogue.models.Book.objects.get(id=stored.get("book_id"))
            books.append(book)
        return (books, tops.totalHits)
541
542
class SearchResult(object):
    """
    Search hits collected for a single book.

    self.book_id - id of the book
    self.score - best score seen for the book
    self._hits - raw hits: (position, fragment anchor, score, other) tuples,
                 where position is (header_type, header_index, header_span)
    self.hits - processed hits, filled in by process_hits()
    """
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
        """
        Build a result from one scored document.

        score - overrides the document score when given (and non-zero).
        how_found, snippets - stored with the hit as extra information.
        """
        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self._hits = []
        self.hits = None  # processed hits

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            # a document without header_type: nothing more to record
            return

        sec = (header_type, int(stored.get("header_index")))
        raw_span = stored.get('header_span')
        # a missing (or zero) span counts as a single section
        header_span = (int(raw_span) if raw_span is not None else 1) or 1

        fragment = stored.get("fragment_anchor")

        other = {'how_found': how_found,
                 'snippets': [snippets] if snippets else []}
        hit = (sec + (header_span,), fragment, scoreDocs.score, other)

        self._hits.append(hit)

    def merge(self, other):
        """
        Add hits of another result for the same book; keep the higher score.
        Raises ValueError when the results belong to different books.
        """
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        """
        Turn raw hits into self.hits: a list of dicts sorted by score,
        best first. Section hits lying inside a fragment hit are dropped,
        duplicates keep the best scored entry. Returns self.
        """
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = [hit for hit in self._hits if hit[FRAGMENT] is not None]

        def inside_fragment(section):
            index = section[POSITION][POSITION_INDEX]
            for f in frags:
                first = f[POSITION][POSITION_INDEX]
                if first <= index < first + f[POSITION][POSITION_SPAN]:
                    return True
            return False

        sect = [hit for hit in self._hits
                if hit[FRAGMENT] is None and not inside_fragment(hit)]

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments and fragments[fid][SCORE] >= f[SCORE]:
                continue
            fragments[fid] = f
        frags = list(fragments.values())

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections and sections[si]['score'] >= s[SCORE]:
                continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            m = {'score': f[SCORE],
                 'fragment': frag,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda hit: hit['score'], reverse=True)

        self.hits = hits

        return self

    def __unicode__(self):
        # the score is a float; %d would cut off its fractional part
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        """
        Merge results from all the lists so that there is one
        SearchResult per book; returns them as a list.
        """
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return list(books.values())

    def __cmp__(self, other):
        return cmp(self.score, other.score)
661
662
class Hint(object):
    """
    Given some hint information (information we already know about)
    our search target - like author, title (specific book), epoch, genre, kind
    we can narrow down search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}   # tag category -> list of tags
        self.part_tags = []   # theme tags
        self._books = []      # books the search is limited to

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        # Kept as a list, so that "no books" is the same value
        # whether or not this method has been called.
        self._books = list(books)

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of)
        is necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                self.book_tags.setdefault(t.category, []).append(t)
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (but they are normally in tags field)
        returns a filter accepting only books with specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme).
        Returns None when there are no book tags.
        """
        tags = [tag for group in self.book_tags.values() for tag in group]
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        """True when the search has not been limited to specific books."""
        return not self._books

    def just_search_in(self, all):
        """Holds logic to figure out which indexes should be search, when we have some hints already"""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
756
757
758 class Search(IndexStore):
759     """
760     Search facilities.
761     """
762     def __init__(self, default_field="content"):
763         IndexStore.__init__(self)
764         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
765         # self.analyzer = WLAnalyzer()
766         self.searcher = IndexSearcher(self.store, True)
767         self.parser = QueryParser(Version.LUCENE_34, default_field,
768                                   self.analyzer)
769
770         self.parent_filter = TermsFilter()
771         self.parent_filter.addTerm(Term("is_book", "true"))
772
773     def query(self, query):
774         """Parse query in default Lucene Syntax. (for humans)
775         """
776         return self.parser.parse(query)
777
778     def simple_search(self, query, max_results=50):
779         """Runs a query for books using lucene syntax. (for humans)
780         Returns (books, total_hits)
781         """
782
783         tops = self.searcher.search(self.query(query), max_results)
784         bks = []
785         for found in tops.scoreDocs:
786             doc = self.searcher.doc(found.doc)
787             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
788         return (bks, tops.totalHits)
789
790     def get_tokens(self, searched, field='content'):
791         """returns tokens analyzed by a proper (for a field) analyzer
792         argument can be: StringReader, string/unicode, or tokens. In the last case
793         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
794         """
795         if isinstance(searched, str) or isinstance(searched, unicode):
796             searched = StringReader(searched)
797         elif isinstance(searched, list):
798             return searched
799
800         searched.reset()
801         tokens = self.analyzer.reusableTokenStream(field, searched)
802         toks = []
803         while tokens.incrementToken():
804             cta = tokens.getAttribute(CharTermAttribute.class_)
805             toks.append(cta.toString())
806         return toks
807
808     def fuzziness(self, fuzzy):
809         """Helper method to sanitize fuzziness"""
810         if not fuzzy:
811             return None
812         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
813             return fuzzy
814         else:
815             return 0.5
816
817     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
818         """
819         Return a PhraseQuery with a series of tokens.
820         """
821         if fuzzy:
822             phrase = MultiPhraseQuery()
823             for t in tokens:
824                 term = Term(field, t)
825                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
826                 fuzzterms = []
827
828                 while True:
829                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
830                     ft = fuzzterm.term()
831                     if ft:
832                         fuzzterms.append(ft)
833                     if not fuzzterm.next(): break
834                 if fuzzterms:
835                     phrase.add(JArray('object')(fuzzterms, Term))
836                 else:
837                     phrase.add(term)
838         else:
839             phrase = PhraseQuery()
840             phrase.setSlop(slop)
841             for t in tokens:
842                 term = Term(field, t)
843                 phrase.add(term)
844         return phrase
845
846     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
847         """
848         Returns term queries joined by boolean query.
849         modal - applies to boolean query
850         fuzzy - should the query by fuzzy.
851         """
852         q = BooleanQuery()
853         for t in tokens:
854             term = Term(field, t)
855             if fuzzy:
856                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
857             else:
858                 term = TermQuery(term)
859             q.add(BooleanClause(term, modal))
860         return q
861
862     # def content_query(self, query):
863     #     return BlockJoinQuery(query, self.parent_filter,
864     #                           BlockJoinQuery.ScoreMode.Total)
865
866     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
867         """
868         Search for perfect book matches. Just see if the query matches with some author or title,
869         taking hints into account.
870         """
871         fields_to_search = ['authors', 'title']
872         only_in = None
873         if hint:
874             if not hint.should_search_for_book():
875                 return []
876             fields_to_search = hint.just_search_in(fields_to_search)
877             only_in = hint.book_filter()
878
879         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
880
881         books = []
882         for q in qrys:
883             top = self.searcher.search(q,
884                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
885                 max_results)
886             for found in top.scoreDocs:
887                 books.append(SearchResult(self.searcher, found))
888         return books
889
890     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
891         fields_to_search = ['tags', 'authors', 'title']
892
893         only_in = None
894         if hint:
895             if not hint.should_search_for_book():
896                 return []
897             fields_to_search = hint.just_search_in(fields_to_search)
898             only_in = hint.book_filter()
899
900         tokens = self.get_tokens(searched, field='SIMPLE')
901
902         q = BooleanQuery()
903
904         for fld in fields_to_search:
905             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
906                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
907
908         books = []
909         top = self.searcher.search(q,
910                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
911             max_results)
912         for found in top.scoreDocs:
913             books.append(SearchResult(self.searcher, found))
914
915         return books
916
917     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
918         """
919         Search for book parts which containt a phrase perfectly matching (with a slop of 2, default for make_phrase())
920         some part/fragment of the book.
921         """
922         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
923
924         flt = None
925         if hint:
926             flt = hint.part_filter()
927
928         books = []
929         for q in qrys:
930             top = self.searcher.search(q,
931                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
932                                                            flt]),
933                                        max_results)
934             for found in top.scoreDocs:
935                 books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))
936
937         return books
938
939     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
940         """
941         Tries to use search terms to match different fields of book (or its parts).
942         E.g. one word can be an author survey, another be a part of the title, and the rest
943         are some words from third chapter.
944         """
945         books = []
946         only_in = None
947
948         if hint:
949             only_in = hint.part_filter()
950
951         # content only query : themes x content
952         q = BooleanQuery()
953
954         tokens_pl = self.get_tokens(searched, field='content')
955         tokens = self.get_tokens(searched, field='SIMPLE')
956
957         # only search in themes when we do not already filter by themes
958         if hint is None or hint.just_search_in(['themes']) != []:
959             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
960                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
961
962         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
963                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
964
965         topDocs = self.searcher.search(q, only_in, max_results)
966         for found in topDocs.scoreDocs:
967             books.append(SearchResult(self.searcher, found))
968             print "* %s theme x content: %s" % (searched, books[-1]._hits)
969
970         # query themes/content x author/title/tags
971         q = BooleanQuery()
972         in_content = BooleanQuery()
973         in_meta = BooleanQuery()
974
975         for fld in ['themes_pl', 'content']:
976             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
977
978         for fld in ['tags', 'authors', 'title']:
979             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
980
981         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
982         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
983
984         topDocs = self.searcher.search(q, only_in, max_results)
985         for found in topDocs.scoreDocs:
986             books.append(SearchResult(self.searcher, found))
987             print "* %s scatter search: %s" % (searched, books[-1]._hits)
988
989         return books
990
991     # def multisearch(self, query, max_results=50):
992     #     """
993     #     Search strategy:
994     #     - (phrase) OR -> content
995     #                   -> title
996     #                   -> authors
997     #     - (keywords)  -> authors
998     #                   -> motyw
999     #                   -> tags
1000     #                   -> content
1001     #     """
1002         # queryreader = StringReader(query)
1003         # tokens = self.get_tokens(queryreader)
1004
1005         # top_level = BooleanQuery()
1006         # Should = BooleanClause.Occur.SHOULD
1007
1008         # phrase_level = BooleanQuery()
1009         # phrase_level.setBoost(1.3)
1010
1011         # p_content = self.make_phrase(tokens, joined=True)
1012         # p_title = self.make_phrase(tokens, 'title')
1013         # p_author = self.make_phrase(tokens, 'author')
1014
1015         # phrase_level.add(BooleanClause(p_content, Should))
1016         # phrase_level.add(BooleanClause(p_title, Should))
1017         # phrase_level.add(BooleanClause(p_author, Should))
1018
1019         # kw_level = BooleanQuery()
1020
1021         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1022         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1023         # kw_level.add(j_themes, Should)
1024         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1025         # j_con = self.make_term_query(tokens, joined=True)
1026         # kw_level.add(j_con, Should)
1027
1028         # top_level.add(BooleanClause(phrase_level, Should))
1029         # top_level.add(BooleanClause(kw_level, Should))
1030
1031         # return None
1032
1033     def get_snippets(self, scoreDoc, query, field='content'):
1034         """
1035         Returns a snippet for found scoreDoc.
1036         """
1037         htmlFormatter = SimpleHTMLFormatter()
1038         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1039
1040         stored = self.searcher.doc(scoreDoc.doc)
1041
1042         # locate content.
1043         snippets = Snippets(stored.get('book_id')).open()
1044         try:
1045             text = snippets.get((int(stored.get('snippets_position')),
1046                                  int(stored.get('snippets_length'))))
1047         finally:
1048             snippets.close()
1049
1050         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1051         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1052         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1053
1054         return snip
1055
1056     @staticmethod
1057     def enum_to_array(enum):
1058         """
1059         Converts a lucene TermEnum to array of Terms, suitable for
1060         addition to queries
1061         """
1062         terms = []
1063
1064         while True:
1065             t = enum.term()
1066             if t:
1067                 terms.append(t)
1068             if not enum.next(): break
1069
1070         if terms:
1071             return JArray('object')(terms, Term)
1072
1073     def search_tags(self, query, filter=None, max_results=40):
1074         """
1075         Search for Tag objects using query.
1076         """
1077         tops = self.searcher.search(query, filter, max_results)
1078
1079         tags = []
1080         for found in tops.scoreDocs:
1081             doc = self.searcher.doc(found.doc)
1082             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1083             tags.append(tag)
1084             print "%s (%d) -> %f" % (tag, tag.id, found.score)
1085
1086         return tags
1087
1088     def search_books(self, query, filter=None, max_results=10):
1089         """
1090         Searches for Book objects using query
1091         """
1092         bks = []
1093         tops = self.searcher.search(query, filter, max_results)
1094         for found in tops.scoreDocs:
1095             doc = self.searcher.doc(found.doc)
1096             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1097         return bks
1098
1099     def create_prefix_phrase(self, toks, field):
1100         q = MultiPhraseQuery()
1101         for i in range(len(toks)):
1102             t = Term(field, toks[i])
1103             if i == len(toks) - 1:
1104                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1105                 if pterms:
1106                     q.add(pterms)
1107                 else:
1108                     q.add(t)
1109             else:
1110                 q.add(t)
1111         return q
1112
1113     @staticmethod
1114     def term_filter(term, inverse=False):
1115         only_term = TermsFilter()
1116         only_term.addTerm(term)
1117
1118         if inverse:
1119             neg = BooleanFilter()
1120             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1121             only_term = neg
1122
1123         return only_term
1124
1125     def hint_tags(self, string, max_results=50):
1126         """
1127         Return auto-complete hints for tags
1128         using prefix search.
1129         """
1130         toks = self.get_tokens(string, field='SIMPLE')
1131         top = BooleanQuery()
1132
1133         for field in ['tag_name', 'tag_name_pl']:
1134             q = self.create_prefix_phrase(toks, field)
1135             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1136
1137         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1138
1139         return self.search_tags(top, no_book_cat, max_results=max_results)
1140
1141     def hint_books(self, string, max_results=50):
1142         """
1143         Returns auto-complete hints for book titles
1144         Because we do not index 'pseudo' title-tags.
1145         Prefix search.
1146         """
1147         toks = self.get_tokens(string, field='SIMPLE')
1148
1149         q = self.create_prefix_phrase(toks, 'title')
1150
1151         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1152
1153     @staticmethod
1154     def chain_filters(filters, op=ChainedFilter.AND):
1155         """
1156         Chains a filter list together
1157         """
1158         filters = filter(lambda x: x is not None, filters)
1159         if not filters:
1160             return None
1161         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1162         return chf
1163
1164     def filtered_categories(self, tags):
1165         """
1166         Return a list of tag categories, present in tags list.
1167         """
1168         cats = {}
1169         for t in tags:
1170             cats[t.category] = True
1171         return cats.keys()
1172
1173     def hint(self):
1174         return Hint(self)