Fix section counting/targeting from search snippet
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from multiprocessing.pool import ThreadPool
31 from threading import current_thread
32 import atexit
33 import traceback
34
35
36 class WLAnalyzer(PerFieldAnalyzerWrapper):
37     def __init__(self):
38         polish = PolishAnalyzer(Version.LUCENE_34)
39         #        polish_gap.setPositionIncrementGap(999)
40
41         simple = SimpleAnalyzer(Version.LUCENE_34)
42         #        simple_gap.setPositionIncrementGap(999)
43
44         keyword = KeywordAnalyzer(Version.LUCENE_34)
45
46         # not sure if needed: there's NOT_ANALYZED meaning basically the same
47
48         PerFieldAnalyzerWrapper.__init__(self, polish)
49
50         self.addAnalyzer("tags", simple)
51         self.addAnalyzer("technical_editors", simple)
52         self.addAnalyzer("editors", simple)
53         self.addAnalyzer("url", keyword)
54         self.addAnalyzer("source_url", keyword)
55         self.addAnalyzer("source_name", simple)
56         self.addAnalyzer("publisher", simple)
57         self.addAnalyzer("authors", simple)
58         self.addAnalyzer("title", simple)
59
60         self.addAnalyzer("is_book", keyword)
61         # shouldn't the title have two forms? _pl and simple?
62
63         self.addAnalyzer("themes", simple)
64         self.addAnalyzer("themes_pl", polish)
65
66         self.addAnalyzer("tag_name", simple)
67         self.addAnalyzer("tag_name_pl", polish)
68
69         self.addAnalyzer("translators", simple)
70
71         self.addAnalyzer("KEYWORD", keyword)
72         self.addAnalyzer("SIMPLE", simple)
73         self.addAnalyzer("POLISH", polish)
74
75
76 class IndexStore(object):
77     """
78     Provides access to search index.
79
80     self.store - lucene index directory
81     """
82     def __init__(self):
83         self.make_index_dir()
84         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
85
86     def make_index_dir(self):
87         try:
88             os.makedirs(settings.SEARCH_INDEX)
89         except OSError as exc:
90             if exc.errno == errno.EEXIST:
91                 pass
92             else: raise
93
94
95 class IndexChecker(IndexStore):
96     def __init__(self):
97         IndexStore.__init__(self)
98
99     def check(self):
100         checker = CheckIndex(self.store)
101         status = checker.checkIndex()
102         return status
103
104
105 class Snippets(object):
106     """
107     This class manages snippet files for an indexed object (book).
108     The snippets are concatenated together; their positions and lengths
109     are kept in Lucene index fields (see the usage sketch after this class).
110     """
111     SNIPPET_DIR = "snippets"
112
113     def __init__(self, book_id):
114         try:
115             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
116         except OSError as exc:
117             if exc.errno == errno.EEXIST:
118                 pass
119             else: raise
120         self.book_id = book_id
121         self.file = None
122
123     def open(self, mode='r'):
124         """
125         Open the snippet file. Call .close() afterwards.
126         """
127         if 'b' not in mode:
128             mode += 'b'
129         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
130         self.position = 0
131         return self
132
133     def add(self, snippet):
134         """
135         Append a snippet (unicode) to the snippet file.
136         Return a (position, length) tuple
137         """
138         txt = snippet.encode('utf-8')
139         l = len(txt)
140         self.file.write(txt)
141         pos = (self.position, l)
142         self.position += l
143         return pos
144
145     def get(self, pos):
146         """
147         Given a (position, length) tuple, return the unicode snippet
148         stored there.
149         """
150         self.file.seek(pos[0], 0)
151         txt = self.file.read(pos[1]).decode('utf-8')
152         return txt
153
154     def close(self):
155         """Close snippet file"""
156         self.file.close()
157
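# A minimal usage sketch for the Snippets class above (illustrative only; the
# book id 123 is hypothetical). add() returns a (position, length) tuple which
# is stored in the index and later passed back to get().
def _snippets_usage_sketch():
    snips = Snippets(123).open('w')
    try:
        pos = snips.add(u"Litwo! Ojczyzno moja!")
    finally:
        snips.close()

    snips = Snippets(123).open()
    try:
        return snips.get(pos)  # == u"Litwo! Ojczyzno moja!"
    finally:
        snips.close()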
158
159 class BaseIndex(IndexStore):
160     """
161     Base index class.
162     Provides basic operations on index: opening, closing, optimizing.
163     """
164     def __init__(self, analyzer=None):
165         super(BaseIndex, self).__init__()
166         self.index = None
167         if not analyzer:
168             analyzer = WLAnalyzer()
169         self.analyzer = analyzer
170
171     def open(self, analyzer=None):
172         if self.index:
173             raise Exception("Index is already opened")
174         self.index = IndexWriter(self.store, self.analyzer,\
175                                  IndexWriter.MaxFieldLength.LIMITED)
176         return self.index
177
178     def optimize(self):
179         self.index.optimize()
180
181     def close(self):
182         try:
183             self.index.optimize()
184         except JavaError, je:
185             print "Error during optimize phase, check index: %s" % je
186
187         self.index.close()
188         self.index = None
189
190     def __enter__(self):
191         self.open()
192         return self
193
194     def __exit__(self, type, value, tb):
195         self.close()
196
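# A minimal sketch of the context-manager protocol provided by BaseIndex above
# (illustrative only; assumes settings.SEARCH_INDEX points at a valid index).
# Entering the block opens an IndexWriter, leaving it optimizes and closes it.
def _base_index_usage_sketch():
    with BaseIndex() as idx:
        idx.optimize()  # idx.index is an open IndexWriter here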
197
198 class Index(BaseIndex):
199     """
200     Class indexing books.
201     """
202     def __init__(self, analyzer=None):
203         super(Index, self).__init__(analyzer)
204
205     def index_tags(self):
206         """
207         Re-index global tag list.
208         Removes all tags from the index, then indexes them again.
209         Indexed fields include: id, name (with and without Polish stems), category.
210         """
211         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
212         self.index.deleteDocuments(q)
213
214         for tag in catalogue.models.Tag.objects.all():
215             doc = Document()
216             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
217             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
218             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
220             self.index.addDocument(doc)
221
222     def create_book_doc(self, book):
223         """
224         Create a Lucene document referring to the book id.
225         """
226         doc = Document()
227         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
228         if book.parent is not None:
229             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
230         return doc
231
232     def remove_book(self, book):
233         """Removes a book from the search index.
234         book - a Book instance."""
235         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
236         self.index.deleteDocuments(q)
237
238     def index_book(self, book, book_info=None, overwrite=True):
239         """
240         Indexes the book.
241         Creates a Lucene document for the extracted metadata
242         and calls self.index_content() to index the contents of the book.
243         """
244         if overwrite:
245             self.remove_book(book)
246
247         book_doc = self.create_book_doc(book)
248         meta_fields = self.extract_metadata(book, book_info)
249         for f in meta_fields.values():
250             if isinstance(f, list) or isinstance(f, tuple):
251                 for elem in f:
252                     book_doc.add(elem)
253             else:
254                 book_doc.add(f)
255
256         self.index.addDocument(book_doc)
257         del book_doc
258
259         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
260
261     master_tags = [
262         'opowiadanie',
263         'powiesc',
264         'dramat_wierszowany_l',
265         'dramat_wierszowany_lp',
266         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
267         'wywiad',
268         ]
269
270     ignore_content_tags = [
271         'uwaga', 'extra',
272         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
273         'didaskalia',
274         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
275         ]
276
277     footnote_tags = ['pa', 'pt', 'pr', 'pe']
278
279     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
280
281     published_date_re = re.compile("([0-9]+)[\]. ]*$")
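    # Worked example of the regexp above (hypothetical source_name values):
    #   published_date_re.search(u"Czytelnik, Warszawa 1990").groups()[0] == "1990"
    #   published_date_re.search(u"Lwow 1894].").groups()[0] == "1894"
    # i.e. it picks up a trailing number, tolerating trailing "]", "." and spaces.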
282
283     def extract_metadata(self, book, book_info=None):
284         """
285         Extracts metadata from the book and returns a map of fields keyed by field name.
286         """
287         fields = {}
288
289         if book_info is None:
290             book_info = dcparser.parse(open(book.xml_file.path))
291
292         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
293         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
294         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
295
296         # validator, name
297         for field in dcparser.BookInfo.FIELDS:
298             if hasattr(book_info, field.name):
299                 if not getattr(book_info, field.name):
300                     continue
301                 # since no type information is available, we use validator
302                 type_indicator = field.validator
303                 if type_indicator == dcparser.as_unicode:
304                     s = getattr(book_info, field.name)
305                     if field.multiple:
306                         s = ', '.join(s)
307                     try:
308                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
309                     except JavaError as je:
310                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
311                 elif type_indicator == dcparser.as_person:
312                     p = getattr(book_info, field.name)
313                     if isinstance(p, dcparser.Person):
314                         persons = unicode(p)
315                     else:
316                         persons = ', '.join(map(unicode, p))
317                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
318                 elif type_indicator == dcparser.as_date:
319                     dt = getattr(book_info, field.name)
320                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
321                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
322
323         # get published date
324         source = book_info.source_name
325         match = self.published_date_re.search(source)
326         if match is not None:
327             fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
328
329         return fields
330
331     def add_gaps(self, fields, fieldname):
332         """
333         Interposes gap fields (indexed single spaces) between the given fields and returns the result.
334         This allows phrase queries that do not cross the gaps (when slop is 0).
335         """
336         def gap():
337             while True:
338                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
339         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
340
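    # Illustration of add_gaps above (hypothetical fields F1, F2, F3): the result
    # is the sequence F1, gap, F2, gap, F3 (the trailing gap is stripped), where
    # each gap is a NOT_ANALYZED single-space field. With slop=0 a phrase query
    # therefore cannot match across two neighbouring values of the same field.
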
341     def get_master(self, root):
342         """
343         Returns the first master tag from an etree.
344         """
345         for master in root.iter():
346             if master.tag in self.master_tags:
347                 return master
348
349     def index_content(self, book, book_fields=[]):
350         """
351         Walks the book XML and extracts content from it.
352         Adds parts for each header tag and for each fragment.
353         """
354         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
355         root = wld.edoc.getroot()
356
357         master = self.get_master(root)
358         if master is None:
359             return []
360
361         def walker(node, ignore_tags=[]):
362             yield node, None
363             for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
364                 for b, e in walker(child):
365                     yield b, e
366             yield None, node
367             return
368
369         def fix_format(text):
370             #            separator = [u" ", u"\t", u".", u";", u","]
371             if isinstance(text, list):
372                 # need to join it first
373                 text = filter(lambda s: s is not None, text)
374                 text = u' '.join(text)
375                 # for i in range(len(text)):
376                 #     if i > 0:
377                 #         if text[i][0] not in separator\
378                 #             and text[i - 1][-1] not in separator:
379                 #          text.insert(i, u" ")
380
381             return re.sub("(?m)/$", "", text)
382
383         def add_part(snippets, **fields):
384             doc = self.create_book_doc(book)
385             for f in book_fields:
386                 doc.add(f)
387
388             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
389             doc.add(NumericField("header_span", Field.Store.YES, True)\
390                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
391             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
392
393             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
394                           Field.TermVector.WITH_POSITIONS_OFFSETS))
395
396             snip_pos = snippets.add(fields["content"])
397             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
398             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
399
400             if 'fragment_anchor' in fields:
401                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
402                               Field.Store.YES, Field.Index.NOT_ANALYZED))
403
404             if 'themes' in fields:
405                 themes, themes_pl = zip(*[
406                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
407                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
408                      for theme in fields['themes']])
409
410                 themes = self.add_gaps(themes, 'themes')
411                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
412
413                 for t in themes:
414                     doc.add(t)
415                 for t in themes_pl:
416                     doc.add(t)
417
418             return doc
419
420         def give_me_utf8(s):
421             if isinstance(s, unicode):
422                 return s.encode('utf-8')
423             else:
424                 return s
425
426         fragments = {}
427         snippets = Snippets(book.id).open('w')
428         try:
429             for header, position in zip(list(master), range(len(master))):
430
431                 if header.tag in self.skip_header_tags:
432                     continue
433                 if header.tag is etree.Comment:
434                     continue
435
436                 # section content
437                 content = []
438                 footnote = None
439
440                 for start, end in walker(header, ignore_tags=self.ignore_content_tags):
441                     # handle footnotes
442                     # if start is not None and start.tag in self.footnote_tags:
443                     #     footnote = ' '.join(start.itertext())
444                     # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
445                     #     doc = add_part(snippets, header_index=position, header_type=header.tag,
446                     #                    content=footnote)
447
448                     #     self.index.addDocument(doc)
449
450                     #     footnote = None
451
452                     # handle fragments and themes.
453                     if start is not None and start.tag == 'begin':
454                         fid = start.attrib['id'][1:]
455                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
456
457                     elif start is not None and start.tag == 'motyw':
458                         fid = start.attrib['id'][1:]
459                         if start.text is not None:
460                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
461
462                     elif start is not None and start.tag == 'end':
463                         fid = start.attrib['id'][1:]
464                         if fid not in fragments:
465                             continue  # a broken <end> node, skip it
467                         frag = fragments[fid]
468                         if frag['themes'] == []:
469                             continue  # empty themes list.
470                         del fragments[fid]
471
472                         doc = add_part(snippets,
473                                        header_type=frag['start_header'],
474                                        header_index=frag['start_section'],
475                                        header_span=position - frag['start_section'] + 1,
476                                        fragment_anchor=fid,
477                                        content=fix_format(frag['content']),
478                                        themes=frag['themes'])
479
480                         self.index.addDocument(doc)
481
482                     # Collect content.
483                     elif start is not None:
484                         for frag in fragments.values():
485                             frag['content'].append(start.text)
486                         content.append(start.text)
487                     elif end is not None:
488                         for frag in fragments.values():
489                             frag['content'].append(end.tail)
490                         content.append(end.tail)
491
492                         # in the end, add a section text.
493                 doc = add_part(snippets, header_index=position, header_type=header.tag,
494                                content=fix_format(content))
495
496                 self.index.addDocument(doc)
497
498         finally:
499             snippets.close()
500
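# A usage sketch for the Index class above (illustrative only; assumes a
# populated catalogue and a writable settings.SEARCH_INDEX directory).
def _reindex_catalogue_sketch():
    with Index() as idx:
        idx.index_tags()
        for book in catalogue.models.Book.objects.all():
            idx.index_book(book)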
501
502 def log_exception_wrapper(f):
503     def _wrap(*a):
504         try:
505             f(*a)
506         except Exception, e:
507             print("Error in indexing thread: %s" % e)
508             traceback.print_exc()
509             raise e
510     return _wrap
511
512
513 class ReusableIndex(Index):
514     """
515     Works like Index, but does not close/optimize the Lucene index
516     until program exit (uses an atexit hook).
517     This is useful for the importbooks command.
518
519     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
520     """
521     index = None
522
523     def open(self, analyzer=None, threads=4):
524         if ReusableIndex.index is not None:
525             self.index = ReusableIndex.index
526         else:
527             print("opening index")
528             Index.open(self, analyzer)
529             ReusableIndex.index = self.index
530             atexit.register(ReusableIndex.close_reusable)
531
532     # def index_book(self, *args, **kw):
533     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
534     #     ReusableIndex.pool_jobs.append(job)
535
536     @staticmethod
537     def close_reusable():
538         if ReusableIndex.index is not None:
539             ReusableIndex.index.optimize()
540             ReusableIndex.index.close()
541             ReusableIndex.index = None
542
543     def close(self):
544         pass
545
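# A usage sketch for ReusableIndex above (illustrative only; "books" stands for
# any iterable of catalogue Book instances). The shared IndexWriter stays open
# across instances and is optimized/closed either by the atexit hook or by an
# explicit call to close_reusable().
def _reusable_index_sketch(books):
    idx = ReusableIndex()
    idx.open()
    for book in books:
        idx.index_book(book)
    ReusableIndex.close_reusable()  # optional if atexit can be relied on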
546
547 class JoinSearch(object):
548     """
549     This mixin could be used to handle block join queries.
550     (currently unused)
551     """
552     def __init__(self, *args, **kw):
553         super(JoinSearch, self).__init__(*args, **kw)
554
555     def wrapjoins(self, query, fields=[]):
556         """
557         This function modifies the query recursively, so that contained
558         Term and Phrase queries which match the provided fields are
559         wrapped in a BlockJoinQuery, and thus delegated to child
560         documents.
561         """
562         if BooleanQuery.instance_(query):
563             qs = BooleanQuery.cast_(query)
564             for clause in qs:
565                 clause = BooleanClause.cast_(clause)
566                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
567             return qs
568         else:
569             termset = HashSet()
570             query.extractTerms(termset)
571             for t in termset:
572                 t = Term.cast_(t)
573                 if t.field() not in fields:
574                     return query
575             return BlockJoinQuery(query, self.parent_filter,
576                                   BlockJoinQuery.ScoreMode.Total)
577
578     def bsearch(self, query, max_results=50):
579         q = self.query(query)
580         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
581
582         tops = self.searcher.search(bjq, max_results)
583         bks = []
584         for found in tops.scoreDocs:
585             doc = self.searcher.doc(found.doc)
586             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
587         return (bks, tops.totalHits)
588
589
590 class SearchResult(object):
591     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
592         if tokens_cache is None: tokens_cache = {}
593
594         if score:
595             self._score = score
596         else:
597             self._score = scoreDocs.score
598
599         self.boost = 1.0
600
601         self._hits = []
602         self._processed_hits = None  # processed hits
603
604         stored = search.searcher.doc(scoreDocs.doc)
605         self.book_id = int(stored.get("book_id"))
606
607         header_type = stored.get("header_type")
608         if not header_type:
609             return
610
611         sec = (header_type, int(stored.get("header_index")))
612         header_span = stored.get('header_span')
613         header_span = header_span is not None and int(header_span) or 1
614
615         fragment = stored.get("fragment_anchor")
616
617         pd = stored.get("published_date")
618         if pd is None:
619             pd = 0
620         self.published_date = int(pd)
621
622         if snippets:
623             snippets = snippets.replace("/\n", "\n")
624         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
625
626         self._hits.append(hit)
627
628         self.search = search
629         self.searched = searched
630         self.tokens_cache = tokens_cache
631
632     @property
633     def score(self):
634         return self._score * self.boost
635
636     def merge(self, other):
637         if self.book_id != other.book_id:
638             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
639         self._hits += other._hits
640         if other.score > self.score:
641             self._score = other._score
642         return self
643
644     def get_book(self):
645         return catalogue.models.Book.objects.get(id=self.book_id)
646
647     book = property(get_book)
648
649     @property
650     def hits(self):
651         if self._processed_hits is not None:
652             return self._processed_hits
653
654         POSITION = 0
655         FRAGMENT = 1
656         POSITION_INDEX = 1
657         POSITION_SPAN = 2
658         SCORE = 2
659         OTHER = 3
660
661         # to sections and fragments
662         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
663         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
664         sect = filter(lambda s: 0 == len(filter(
665             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
666             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
667             frags)), sect)
668
669         hits = []
670
671         # remove duplicate fragments
672         fragments = {}
673         for f in frags:
674             fid = f[FRAGMENT]
675             if fid in fragments:
676                 if fragments[fid][SCORE] >= f[SCORE]:
677                     continue
678             fragments[fid] = f
679         frags = fragments.values()
680
681         # remove duplicate sections
682         sections = {}
683
684         for s in sect:
685             si = s[POSITION][POSITION_INDEX]
686             # skip existing
687             if si in sections:
688                 if sections[si]['score'] >= s[SCORE]:
689                     continue
690
691             m = {'score': s[SCORE],
692                  'section_number': s[POSITION][POSITION_INDEX] + 1,
693                  }
694             m.update(s[OTHER])
695             sections[si] = m
696
697         hits = sections.values()
698
699         for f in frags:
700             try:
701                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
702             except catalogue.models.Fragment.DoesNotExist:
703                 # stale index
704                 continue
705
706             # Figure out if we were searching for a token matching some word in theme name.
707             themes = frag.tags.filter(category='theme')
708             themes_hit = []
709             if self.searched is not None:
710                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
711                 for theme in themes:
712                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
713                     for t in tokens:
714                         if t in name_tokens:
715                             if not theme in themes_hit:
716                                 themes_hit.append(theme)
717                             break
718
719             m = {'score': f[SCORE],
720                  'fragment': frag,
721                  'section_number': f[POSITION][POSITION_INDEX] + 1,
722                  'themes': themes,
723                  'themes_hit': themes_hit
724                  }
725             m.update(f[OTHER])
726             hits.append(m)
727
728         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
729
730         self._processed_hits = hits
731
732         return hits
733
734     def __unicode__(self):
735         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
736
737     @staticmethod
738     def aggregate(*result_lists):
739         books = {}
740         for rl in result_lists:
741             for r in rl:
742                 if r.book_id in books:
743                     books[r.book_id].merge(r)
744                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
745                 else:
746                     books[r.book_id] = r
747         return books.values()
748
749     def __cmp__(self, other):
750         c = cmp(self.score, other.score)
751         if c == 0:
752             # this is inverted, because earlier date is better
753             return cmp(other.published_date, self.published_date)
754         else:
755             return c
756
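# A sketch of consuming SearchResult.hits as built above (illustrative only).
# Every entry is a dict with at least 'score' and 'section_number'; fragment
# hits additionally carry 'fragment', 'themes' and 'themes_hit'.
def _search_result_hits_sketch(result):
    for hit in result.hits:
        if 'fragment' in hit:
            print "fragment in section %d (themes hit: %s)" % (hit['section_number'], hit['themes_hit'])
        else:
            print "section %d, score %f" % (hit['section_number'], hit['score'])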
757
758 class Hint(object):
759     """
760     Given some hint information (things we already know about the search
761     target, like author, title of a specific book, epoch, genre or kind),
762     we can narrow down the search using filters.
763     """
764     def __init__(self, search):
765         """
766         Accepts a Searcher instance.
767         """
768         self.search = search
769         self.book_tags = {}
770         self.part_tags = []
771         self._books = []
772
773     def books(self, *books):
774         """
775         Give a hint that we search these books.
776         """
777         self._books = books
778
779     def tags(self, tags):
780         """
781         Give a hint that these Tag objects (given as a list)
782         are required.
783         """
784         for t in tags:
785             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
786                 lst = self.book_tags.get(t.category, [])
787                 lst.append(t)
788                 self.book_tags[t.category] = lst
789             if t.category in ['theme', 'theme_pl']:
790                 self.part_tags.append(t)
791
792     def tag_filter(self, tags, field='tags'):
793         """
794         Given a list of tags and an optional field (they are normally in the 'tags' field),
795         returns a filter accepting only books with the specified tags.
796         """
797         q = BooleanQuery()
798
799         for tag in tags:
800             toks = self.search.get_tokens(tag.name, field=field)
801             tag_phrase = PhraseQuery()
802             for tok in toks:
803                 tag_phrase.add(Term(field, tok))
804             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
805
806         return QueryWrapperFilter(q)
807
808     def book_filter(self):
809         """
810         Filters using book tags (all tag categories except theme).
811         """
812         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
813         if tags:
814             return self.tag_filter(tags)
815         else:
816             return None
817
818     def part_filter(self):
819         """
820         This filter can be used to look for book parts.
821         It filters on book id and/or themes.
822         """
823         fs = []
824         if self.part_tags:
825             fs.append(self.tag_filter(self.part_tags, field='themes'))
826
827         if self._books != []:
828             bf = BooleanFilter()
829             for b in self._books:
830                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
831                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
832             fs.append(bf)
833
834         return Search.chain_filters(fs)
835
836     def should_search_for_book(self):
837         return self._books == []
838
839     def just_search_in(self, all):
840         """Holds logic to figure out which indexes should be searched when we already have some hints."""
841         some = []
842         for field in all:
843             if field == 'authors' and 'author' in self.book_tags:
844                 continue
845             if field == 'title' and self._books != []:
846                 continue
847             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
848                 continue
849             some.append(field)
850         return some
851
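# A usage sketch for Hint above (illustrative only; "search" is a Search
# instance defined below, "author_tag" a hypothetical Tag with category
# 'author'). Known book tags narrow the book filter; theme tags go into the
# part filter used when searching inside books.
def _hint_usage_sketch(search, author_tag):
    hint = search.hint()
    hint.tags([author_tag])
    return search.search_perfect_book(u"Pan Tadeusz", hint=hint)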
852
853 class Search(IndexStore):
854     """
855     Search facilities.
856     """
857     def __init__(self, default_field="content"):
858         IndexStore.__init__(self)
859         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
860         # self.analyzer = WLAnalyzer()
861         self.searcher = IndexSearcher(self.store, True)
862         self.parser = QueryParser(Version.LUCENE_34, default_field,
863                                   self.analyzer)
864
865         self.parent_filter = TermsFilter()
866         self.parent_filter.addTerm(Term("is_book", "true"))
867
868     def query(self, query):
869         """Parse query in default Lucene Syntax. (for humans)
870         """
871         return self.parser.parse(query)
872
873     def simple_search(self, query, max_results=50):
874         """Runs a query for books using lucene syntax. (for humans)
875         Returns (books, total_hits)
876         """
877
878         tops = self.searcher.search(self.query(query), max_results)
879         bks = []
880         for found in tops.scoreDocs:
881             doc = self.searcher.doc(found.doc)
882             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
883         return (bks, tops.totalHits)
884
885     def get_tokens(self, searched, field='content', cached=None):
886         """Returns tokens analyzed by the analyzer appropriate for the given field.
887         The argument can be a StringReader, a string/unicode, or a list of tokens. In the last
888         case the tokens are returned as-is (so we can reuse them if the analyzer does not change).
889         """
890         if cached is not None and field in cached:
891             return cached[field]
892
893         if isinstance(searched, str) or isinstance(searched, unicode):
894             searched = StringReader(searched)
895         elif isinstance(searched, list):
896             return searched
897
898         searched.reset()
899         tokens = self.analyzer.reusableTokenStream(field, searched)
900         toks = []
901         while tokens.incrementToken():
902             cta = tokens.getAttribute(CharTermAttribute.class_)
903             toks.append(cta.toString())
904
905         if cached is not None:
906             cached[field] = toks
907
908         return toks
909
910     def fuzziness(self, fuzzy):
911         """Helper method to sanitize fuzziness"""
912         if not fuzzy:
913             return None
914         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
915             return fuzzy
916         else:
917             return 0.5
918
919     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
920         """
921         Return a PhraseQuery with a series of tokens.
922         """
923         if fuzzy:
924             phrase = MultiPhraseQuery()
925             for t in tokens:
926                 term = Term(field, t)
927                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
928                 fuzzterms = []
929
930                 while True:
931                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
932                     ft = fuzzterm.term()
933                     if ft:
934                         fuzzterms.append(ft)
935                     if not fuzzterm.next(): break
936                 if fuzzterms:
937                     phrase.add(JArray('object')(fuzzterms, Term))
938                 else:
939                     phrase.add(term)
940         else:
941             phrase = PhraseQuery()
942             phrase.setSlop(slop)
943             for t in tokens:
944                 term = Term(field, t)
945                 phrase.add(term)
946         return phrase
947
948     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
949         """
950         Returns term queries joined by a boolean query.
951         modal - applies to the boolean query
952         fuzzy - should the query be fuzzy.
953         """
954         q = BooleanQuery()
955         for t in tokens:
956             term = Term(field, t)
957             if fuzzy:
958                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
959             else:
960                 term = TermQuery(term)
961             q.add(BooleanClause(term, modal))
962         return q
963
964     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
965                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
966         if filters is None: filters = []
967         if tokens_cache is None: tokens_cache = {}
968
969         tokens = self.get_tokens(searched, field, cached=tokens_cache)
970
971         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
972         if book:
973             filters.append(self.term_filter(Term('is_book', 'true')))
974         top = self.searcher.search(query, self.chain_filters(filters), max_results)
975
976         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
977
978     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
979                     filters=None, tokens_cache=None, boost=None, snippets=True):
980         if filters is None: filters = []
981         if tokens_cache is None: tokens_cache = {}
982
983         if book:
984             filters.append(self.term_filter(Term('is_book', 'true')))
985
986         query = BooleanQuery()
987
988         for fld in fields:
989             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
990
991             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
992                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
993
994         top = self.searcher.search(query, self.chain_filters(filters), max_results)
995
996         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
997                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
998
999     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1000         """
1001         Search for perfect book matches. Just see if the query matches with some author or title,
1002         taking hints into account.
1003         """
1004         fields_to_search = ['authors', 'title']
1005         only_in = None
1006         if hint:
1007             if not hint.should_search_for_book():
1008                 return []
1009             fields_to_search = hint.just_search_in(fields_to_search)
1010             only_in = hint.book_filter()
1011
1012         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1013
1014         books = []
1015         for q in qrys:
1016             top = self.searcher.search(q,
1017                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1018                 max_results)
1019             for found in top.scoreDocs:
1020                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1021         return books
1022
1023     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1024         fields_to_search = ['tags', 'authors', 'title']
1025
1026         only_in = None
1027         if hint:
1028             if not hint.should_search_for_book():
1029                 return []
1030             fields_to_search = hint.just_search_in(fields_to_search)
1031             only_in = hint.book_filter()
1032
1033         tokens = self.get_tokens(searched, field='SIMPLE')
1034
1035         q = BooleanQuery()
1036
1037         for fld in fields_to_search:
1038             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1039                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1040
1041         books = []
1042         top = self.searcher.search(q,
1043                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1044             max_results)
1045         for found in top.scoreDocs:
1046             books.append(SearchResult(self, found, how_found="search_book"))
1047
1048         return books
1049
1050     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1051         """
1052         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1053         some part/fragment of the book.
1054         """
1055         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1056
1057         flt = None
1058         if hint:
1059             flt = hint.part_filter()
1060
1061         books = []
1062         for q in qrys:
1063             top = self.searcher.search(q,
1064                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1065                                                            flt]),
1066                                        max_results)
1067             for found in top.scoreDocs:
1068                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1069
1070         return books
1071
1072     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1073         """
1074         Tries to use search terms to match different fields of book (or its parts).
1075         E.g. one word can be an author's surname, another a part of the title, and the rest
1076         some words from the third chapter.
1077         """
1078         if tokens_cache is None: tokens_cache = {}
1079         books = []
1080         only_in = None
1081
1082         if hint:
1083             only_in = hint.part_filter()
1084
1085         # content only query : themes x content
1086         q = BooleanQuery()
1087
1088         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1089         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1090
1091         # only search in themes when we do not already filter by themes
1092         if hint is None or hint.just_search_in(['themes']) != []:
1093             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1094                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1095
1096         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1097                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1098
1099         topDocs = self.searcher.search(q, only_in, max_results)
1100         for found in topDocs.scoreDocs:
1101             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1102             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1103
1104         # query themes/content x author/title/tags
1105         q = BooleanQuery()
1106         in_content = BooleanQuery()
1107         in_meta = BooleanQuery()
1108
1109         for fld in ['themes_pl', 'content']:
1110             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1111
1112         for fld in ['tags', 'authors', 'title']:
1113             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1114
1115         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1116         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1117
1118         topDocs = self.searcher.search(q, only_in, max_results)
1119         for found in topDocs.scoreDocs:
1120             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1121             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1122
1123         return books
1124
1125     # def multisearch(self, query, max_results=50):
1126     #     """
1127     #     Search strategy:
1128     #     - (phrase) OR -> content
1129     #                   -> title
1130     #                   -> authors
1131     #     - (keywords)  -> authors
1132     #                   -> motyw
1133     #                   -> tags
1134     #                   -> content
1135     #     """
1136         # queryreader = StringReader(query)
1137         # tokens = self.get_tokens(queryreader)
1138
1139         # top_level = BooleanQuery()
1140         # Should = BooleanClause.Occur.SHOULD
1141
1142         # phrase_level = BooleanQuery()
1143         # phrase_level.setBoost(1.3)
1144
1145         # p_content = self.make_phrase(tokens, joined=True)
1146         # p_title = self.make_phrase(tokens, 'title')
1147         # p_author = self.make_phrase(tokens, 'author')
1148
1149         # phrase_level.add(BooleanClause(p_content, Should))
1150         # phrase_level.add(BooleanClause(p_title, Should))
1151         # phrase_level.add(BooleanClause(p_author, Should))
1152
1153         # kw_level = BooleanQuery()
1154
1155         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1156         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1157         # kw_level.add(j_themes, Should)
1158         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1159         # j_con = self.make_term_query(tokens, joined=True)
1160         # kw_level.add(j_con, Should)
1161
1162         # top_level.add(BooleanClause(phrase_level, Should))
1163         # top_level.add(BooleanClause(kw_level, Should))
1164
1165         # return None
1166
1167     def get_snippets(self, scoreDoc, query, field='content'):
1168         """
1169         Returns a snippet for found scoreDoc.
1170         """
1171         htmlFormatter = SimpleHTMLFormatter()
1172         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1173
1174         stored = self.searcher.doc(scoreDoc.doc)
1175
1176         position = stored.get('snippets_position')
1177         length = stored.get('snippets_length')
1178         if position is None or length is None:
1179             return None
1180         # locate content.
1181         snippets = Snippets(stored.get('book_id')).open()
1182         try:
1183             text = snippets.get((int(position),
1184                                  int(length)))
1185         finally:
1186             snippets.close()
1187
1188         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1189         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1190         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1191
1192         return snip
1193
1194     @staticmethod
1195     def enum_to_array(enum):
1196         """
1197         Converts a Lucene TermEnum to an array of Terms, suitable for
1198         addition to queries.
1199         """
1200         terms = []
1201
1202         while True:
1203             t = enum.term()
1204             if t:
1205                 terms.append(t)
1206             if not enum.next(): break
1207
1208         if terms:
1209             return JArray('object')(terms, Term)
1210
1211     def search_tags(self, query, filter=None, max_results=40):
1212         """
1213         Search for Tag objects using query.
1214         """
1215         tops = self.searcher.search(query, filter, max_results)
1216
1217         tags = []
1218         for found in tops.scoreDocs:
1219             doc = self.searcher.doc(found.doc)
1220             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1221             tags.append(tag)
1222             print "%s (%d) -> %f" % (tag, tag.id, found.score)
1223
1224         return tags
1225
1226     def search_books(self, query, filter=None, max_results=10):
1227         """
1228         Searches for Book objects using query
1229         """
1230         bks = []
1231         tops = self.searcher.search(query, filter, max_results)
1232         for found in tops.scoreDocs:
1233             doc = self.searcher.doc(found.doc)
1234             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1235         return bks
1236
1237     def create_prefix_phrase(self, toks, field):
1238         q = MultiPhraseQuery()
1239         for i in range(len(toks)):
1240             t = Term(field, toks[i])
1241             if i == len(toks) - 1:
1242                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1243                 if pterms:
1244                     q.add(pterms)
1245                 else:
1246                     q.add(t)
1247             else:
1248                 q.add(t)
1249         return q
1250
1251     @staticmethod
1252     def term_filter(term, inverse=False):
1253         only_term = TermsFilter()
1254         only_term.addTerm(term)
1255
1256         if inverse:
1257             neg = BooleanFilter()
1258             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1259             only_term = neg
1260
1261         return only_term
1262
1263     def hint_tags(self, string, max_results=50):
1264         """
1265         Return auto-complete hints for tags
1266         using prefix search.
1267         """
1268         toks = self.get_tokens(string, field='SIMPLE')
1269         top = BooleanQuery()
1270
1271         for field in ['tag_name', 'tag_name_pl']:
1272             q = self.create_prefix_phrase(toks, field)
1273             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1274
1275         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1276
1277         return self.search_tags(top, no_book_cat, max_results=max_results)
1278
1279     def hint_books(self, string, max_results=50):
1280         """
1281         Returns auto-complete hints for book titles
1282         (because we do not index 'pseudo' title tags).
1283         Uses prefix search.
1284         """
1285         toks = self.get_tokens(string, field='SIMPLE')
1286
1287         q = self.create_prefix_phrase(toks, 'title')
1288
1289         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1290
1291     @staticmethod
1292     def chain_filters(filters, op=ChainedFilter.AND):
1293         """
1294         Chains a filter list together
1295         """
1296         filters = filter(lambda x: x is not None, filters)
1297         if not filters:
1298             return None
1299         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1300         return chf
1301
1302     def filtered_categories(self, tags):
1303         """
1304         Return a list of tag categories present in the given tags list.
1305         """
1306         cats = {}
1307         for t in tags:
1308             cats[t.category] = True
1309         return cats.keys()
1310
1311     def hint(self):
1312         return Hint(self)
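

# A minimal end-to-end sketch of the search flow (illustrative only; the query
# string is hypothetical). Results from the different strategies are merged per
# book with SearchResult.aggregate() and sorted best-first via SearchResult's
# __cmp__ (score, with ties broken by earlier publication date).
def _full_search_sketch(query=u"ojczyzna"):
    search = Search()
    tokens_cache = {}
    results = SearchResult.aggregate(
        search.search_perfect_book(query),
        search.search_perfect_parts(query),
        search.search_everywhere(query, tokens_cache=tokens_cache))
    results.sort(reverse=True)
    return results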