Some search fixes.
[wolnelektury.git] / apps/search/index.py
# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM before any lucene classes are used.
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    """
    Provides access to search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            # The index directory may already exist; anything else is fatal.
            if exc.errno != errno.EEXIST:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            # The snippet directory may already exist; anything else is fatal.
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet
        stored there, as unicode.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()


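# A minimal usage sketch for Snippets (the book id 123 is hypothetical).
# The (position, length) tuple returned by add() is what gets stored in the
# index and handed back to get() at highlighting time:
#
#     snips = Snippets(123).open('w')
#     try:
#         pos = snips.add(u"some snippet text")
#     finally:
#         snips.close()
#
#     snips = Snippets(123).open('r')
#     try:
#         assert snips.get(pos) == u"some snippet text"
#     finally:
#         snips.close()
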
class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()


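# BaseIndex and its subclasses are context managers. A sketch of batch
# indexing (assuming a populated catalogue):
#
#     with Index() as index:
#         index.index_tags()
#         for book in catalogue.models.Book.objects.all():
#             index.index_book(book)
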
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to a book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        # published_date may be absent when no date could be parsed
        # out of source_name in extract_metadata().
        book_fields = [meta_fields[n] for n in ('title', 'authors', 'published_date')
                       if n in meta_fields]
        self.index_content(book, book_fields=book_fields)

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # get published date
        source = book_info.source_name
        match = self.published_date_re.search(source)
        if match is not None:
            fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interpose a list of fields with gap-fields, which are indexed spaces, and return it.
        This allows phrase queries which do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

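    # Illustration (not executed): for two 'tags' values 'romantyzm' and
    # 'epika', add_gaps() returns
    #     [Field(tags, 'romantyzm'), Field(tags, ' '), Field(tags, 'epika')]
    # so a slop-0 phrase query cannot match across the two tag values.
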
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            yield node, None
            for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span', 1)))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        position = 0
        try:
            for header in list(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = None

                for start, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = ' '.join(start.itertext())
                    elif end is not None and footnote is not None and end.tag in self.footnote_tags:
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       content=footnote)

                        self.index.addDocument(doc)

                        footnote = None

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)

                    # collect content
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                        content.append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
                        content.append(end.tail)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))

                self.index.addDocument(doc)
                position += 1

        finally:
            snippets.close()


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass


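# A usage sketch for ReusableIndex (assuming repeated indexing passes in one
# process; import_book is a hypothetical helper):
#
#     for xml_file in files:
#         book = import_book(xml_file)
#         with ReusableIndex() as index:
#             index.index_book(book)
#     # the shared writer is optimized and closed once, at interpreter exit
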
class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query in a recursive way,
        so that contained Term and Phrase queries matching
        the provided fields are wrapped in a BlockJoinQuery,
        and so delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = header_span is not None and int(header_span) or 1

        fragment = stored.get("fragment_anchor")

        pd = stored.get("published_date")
        if pd is None:
            print("published_date is none for book %d" % self.book_id)
            pd = 0
        self.published_date = int(pd)

        if snippets:
            snippets = snippets.replace("/\n", "\n")
        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

        self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            # `score` is a read-only property, so update the underlying value.
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # split hits into sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        # drop sections that are already covered by a fragment hit
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments, keeping the best score for each anchor
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections, keeping the best score for each index
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)
                            break

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c


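# A sketch of the intended flow (assuming a Search instance named `search`):
# run several strategies, merge per-book results, then sort best-first.
#
#     results = SearchResult.aggregate(
#         search.search_perfect_book(u"lalka"),
#         search.search_everywhere(u"lalka"))
#     results.sort(reverse=True)  # uses SearchResult.__cmp__
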
class Hint(object):
    """
    Given some hint information (things we already know about the search
    target, like author, title (a specific book), epoch, genre, kind),
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list) are necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the tags field),
        returns a filter accepting only books with the specified tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched, when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some


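# Hint usage (a sketch; the tag queryset is illustrative):
#
#     search = Search()
#     hint = search.hint()
#     hint.tags(catalogue.models.Tag.objects.filter(category='author'))
#     # 'authors' is now dropped from the searched fields, and results are
#     # narrowed to books carrying those author tags:
#     results = search.search_perfect_book(u"ballady i romanse", hint=hint)
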
class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits).
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

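    # Example (a sketch; assumes an existing index and catalogue):
    #
    #     books, total = Search().simple_search(u'authors:mickiewicz', max_results=10)
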
    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: a StringReader, a string/unicode, or tokens. In the last case
        they are just returned (so we can reuse tokens, if we don't change the analyzer).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks

    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness parameter."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched)
                for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=self.get_snippets(found, query)) for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
        some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        if tokens_cache is None: tokens_cache = {}
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)

    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD

    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)

    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')

    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))

    #     kw_level = BooleanQuery()

    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)

    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))

    #     return None

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None
        # locate content.
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(position),
                                 int(length)))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Search for Tag objects using a query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using a query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                # expand the last token into all terms sharing its prefix
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        """
        Return auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Return auto-complete hints for book titles,
        using prefix search (since we do not index 'pseudo' title-tags).
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a list of filters together, skipping Nones.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

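    # Example (a sketch): combine an optional hint filter with a book filter;
    # Nones are dropped, so this is safe when no hint was given.
    #
    #     flt = Search.chain_filters([only_in, Search.term_filter(Term('is_book', 'true'))])
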
    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)