Fixed collecting content for snippet generation
[wolnelektury.git] / apps / search / index.py
# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM before any lucene classes are used.
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status

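# Sketch of a health check (CheckIndex.Status in Lucene 3.x exposes a
# `clean` flag; treat this as an illustration, not a tested snippet):
#
#     status = IndexChecker().check()
#     if not status.clean:
#         print "index is broken, consider rebuilding it"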

class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together in one file; their positions
    and lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

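# Usage sketch (hypothetical book id; get() only makes sense with a
# (position, length) tuple previously returned by add()):
#
#     snips = Snippets(book_id=1).open('w')
#     try:
#         pos = snips.add(u"Litwo! Ojczyzno moja!")
#     finally:
#         snips.close()
#
#     snips = Snippets(book_id=1).open()
#     try:
#         assert snips.get(pos) == u"Litwo! Ojczyzno moja!"
#     finally:
#         snips.close()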

class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on the index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()

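# BaseIndex subclasses work as context managers, so the writer is always
# closed (and the index optimized) even on errors. A sketch, assuming a
# catalogue Book instance is at hand:
#
#     with Index() as index:
#         index.index_book(book)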

class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields (indexed single spaces) and returns the result.
        This allows phrase queries that do not match across the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()), ())[0:-1]

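    # E.g. add_gaps([f1, f2, f3], 'tags') yields (f1, gap, f2, gap, f3): a
    # NOT_ANALYZED single-space "tags" field sits between every two real
    # values, so a slop-0 phrase cannot match across two tag values.
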
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span', 1)))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue

                # section content
                content = []

                for start, end in walker(header):
                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)

                    # collect content for the section and for open fragments.
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                        content.append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
                        content.append(end.tail)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(u' '.join(filter(lambda s: s is not None, content))))

                self.index.addDocument(doc)

        finally:
            snippets.close()


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass

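# Sketch of a batch import: the writer stays open across books and is
# optimized and closed once, by the atexit hook or the explicit call below.
#
#     index = ReusableIndex()
#     index.open()
#     for book in catalogue.models.Book.objects.all():
#         index.index_book(book)
#     ReusableIndex.close_reusable()  # optional; atexit would do this too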

class JoinSearch(object):
    """
    This mixin can be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        Modifies the query recursively, so that contained Term and Phrase
        queries which match the provided fields are wrapped in a
        BlockJoinQuery and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self._hits = []
        self.hits = None  # processed hits

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = int(header_span) if header_span is not None else 1

        fragment = stored.get("fragment_anchor")

        if snippets:
            snippets = snippets.replace("/\n", "\n")
        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': [snippets] if snippets else []})

        self._hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # split hits into fragments and sections
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        # drop sections already covered by a matching fragment
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        hits = []

        # remove duplicate fragments, keeping the best score
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections, keeping the best score
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self.hits = hits

        return self

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)

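# Sketch of combining results from several strategies and ranking books by
# their best hit (method names as defined on Search below):
#
#     search = Search()
#     results = SearchResult.aggregate(
#         search.search_perfect_book(u"pan tadeusz"),
#         search.search_everywhere(u"pan tadeusz"))
#     for result in sorted(results, reverse=True):  # __cmp__ compares scores
#         print result.book_id, result.process_hits().hits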

class Hint(object):
    """
    Given some hint information (things we already know about the search
    target) - like the author, title (specific book), epoch, genre or kind -
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search within these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list) are required.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they normally live in the
        'tags' field), returns a filter accepting only books with these tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag categories except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Decides which fields should still be searched, given the hints we already have."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some

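# Sketch: narrowing a search with a Hint (assumes `author_tag` is a Tag
# with category 'author'):
#
#     search = Search()
#     hint = search.hint()
#     hint.tags([author_tag])
#     books = search.search_perfect_book(u"ballady", hint=hint)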

class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits).
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the analyzer appropriate for the field.
        The argument can be a StringReader, a string/unicode, or a token list.
        In the last case the tokens are returned as-is (so we can reuse them,
        as long as the analyzer does not change).
        """
        if isinstance(searched, basestring):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

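    # Sketch: the same string tokenizes differently per field, because
    # WLAnalyzer routes each field to a different analyzer:
    #
    #     search = Search()
    #     search.get_tokens(u"Pan Tadeusz", field='SIMPLE')   # simple tokenizer
    #     search.get_tokens(u"Pan Tadeusz", field='content')  # polish stemming
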
    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness parameter."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

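    # Sketch: a slop-2 phrase query over title tokens (slop=2 is the
    # default, as used by search_perfect_parts below; assumes `search`
    # is a Search instance):
    #
    #     toks = search.get_tokens(u"pan tadeusz", field='title')
    #     q = search.make_phrase(toks, field='title')
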
    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    # def content_query(self, query):
    #     return BlockJoinQuery(query, self.parent_filter,
    #                           BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
        some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content')
        tokens = self.get_tokens(searched, field='SIMPLE')

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found='search_everywhere_themesXcontent'))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found='search_everywhere'))

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        # return None

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate content.
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        adding to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Search for Tag objects using query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        """
        Returns auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Returns auto-complete hints for book titles, using prefix search.
        (Necessary because we do not index 'pseudo' title-tags.)
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together, skipping None entries.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

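    # Sketch: AND-chaining filters; None entries (e.g. an absent hint
    # filter) are dropped, and an empty list yields None (no filtering):
    #
    #     flt = Search.chain_filters([
    #         Search.term_filter(Term('is_book', 'true')),
    #         None,
    #     ])
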
    def filtered_categories(self, tags):
        """
        Returns a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)