[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from multiprocessing.pool import ThreadPool
31 from threading import current_thread
32 import atexit
33 import traceback
34
35
36 class WLAnalyzer(PerFieldAnalyzerWrapper):
37     def __init__(self):
38         polish = PolishAnalyzer(Version.LUCENE_34)
39         #        polish_gap.setPositionIncrementGap(999)
40
41         simple = SimpleAnalyzer(Version.LUCENE_34)
42         #        simple_gap.setPositionIncrementGap(999)
43
44         keyword = KeywordAnalyzer(Version.LUCENE_34)
45
46         # not sure if needed: there's NOT_ANALYZED meaning basically the same
47
48         PerFieldAnalyzerWrapper.__init__(self, polish)
49
50         self.addAnalyzer("tags", simple)
51         self.addAnalyzer("technical_editors", simple)
52         self.addAnalyzer("editors", simple)
53         self.addAnalyzer("url", keyword)
54         self.addAnalyzer("source_url", keyword)
55         self.addAnalyzer("source_name", simple)
56         self.addAnalyzer("publisher", simple)
57         self.addAnalyzer("authors", simple)
58         self.addAnalyzer("title", simple)
59
60         self.addAnalyzer("is_book", keyword)
61         # shouldn't the title have two forms? _pl and simple?
62
63         self.addAnalyzer("themes", simple)
64         self.addAnalyzer("themes_pl", polish)
65
66         self.addAnalyzer("tag_name", simple)
67         self.addAnalyzer("tag_name_pl", polish)
68
69         self.addAnalyzer("translators", simple)
70
71         self.addAnalyzer("KEYWORD", keyword)
72         self.addAnalyzer("SIMPLE", simple)
73         self.addAnalyzer("POLISH", polish)
74
75
76 class IndexStore(object):
77     """
78     Provides access to the search index.
79
80     self.store - Lucene index directory
81     """
82     def __init__(self):
83         self.make_index_dir()
84         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
85
86     def make_index_dir(self):
87         try:
88             os.makedirs(settings.SEARCH_INDEX)
89         except OSError as exc:
90             if exc.errno == errno.EEXIST:
91                 pass
92             else: raise
93
94
95 class IndexChecker(IndexStore):
96     def __init__(self):
97         IndexStore.__init__(self)
98
99     def check(self):
100         checker = CheckIndex(self.store)
101         status = checker.checkIndex()
102         return status
103
104
105 class Snippets(object):
106     """
107     This class manages the snippet file for an indexed object (book).
108     The snippets are concatenated together; their positions and
109     lengths are kept in Lucene index fields.
110     """
111     SNIPPET_DIR = "snippets"
112
113     def __init__(self, book_id):
114         try:
115             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
116         except OSError as exc:
117             if exc.errno == errno.EEXIST:
118                 pass
119             else: raise
120         self.book_id = book_id
121         self.file = None
122
123     def open(self, mode='r'):
124         """
125         Open the snippet file. Call .close() afterwards.
126         """
127         if 'b' not in mode:
128             mode += 'b'
129         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
130         self.position = 0
131         return self
132
133     def add(self, snippet):
134         """
135         Append a snippet (unicode) to the snippet file.
136         Return a (position, length) tuple
137         """
138         txt = snippet.encode('utf-8')
139         l = len(txt)
140         self.file.write(txt)
141         pos = (self.position, l)
142         self.position += l
143         return pos
144
145     def get(self, pos):
146         """
147         Given a (position, length) tuple, return the snippet
148         stored there as a unicode string.
149         """
150         self.file.seek(pos[0], 0)
151         txt = self.file.read(pos[1]).decode('utf-8')
152         return txt
153
154     def close(self):
155         """Close snippet file"""
156         self.file.close()
157
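# Snippets usage sketch (illustrative only; assumes a book with id 1 is being
# indexed and settings.SEARCH_INDEX is writable):
#
#   snips = Snippets(1).open('w')
#   try:
#       pos = snips.add(u"some fragment text")   # returns (position, length)
#   finally:
#       snips.close()
#
#   snips = Snippets(1).open()                   # read mode
#   try:
#       text = snips.get(pos)
#   finally:
#       snips.close()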
158
159 class BaseIndex(IndexStore):
160     """
161     Base index class.
162     Provides basic operations on index: opening, closing, optimizing.
163     """
164     def __init__(self, analyzer=None):
165         super(BaseIndex, self).__init__()
166         self.index = None
167         if not analyzer:
168             analyzer = WLAnalyzer()
169         self.analyzer = analyzer
170
171     def open(self, analyzer=None):
172         if self.index:
173             raise Exception("Index is already opened")
174         self.index = IndexWriter(self.store, self.analyzer,\
175                                  IndexWriter.MaxFieldLength.LIMITED)
176         return self.index
177
178     def optimize(self):
179         self.index.optimize()
180
181     def close(self):
182         try:
183             self.index.optimize()
184         except JavaError as je:
185             print("Error during optimize phase, check index: %s" % je)
186
187         self.index.close()
188         self.index = None
189
190     def __enter__(self):
191         self.open()
192         return self
193
194     def __exit__(self, type, value, tb):
195         self.close()
196
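# BaseIndex subclasses support the context-manager protocol, so the index is
# opened and closed automatically, e.g. (sketch; assumes `book` is a
# catalogue.models.Book instance):
#
#   with Index() as index:
#       index.index_book(book)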
197
198 class Index(BaseIndex):
199     """
200     Class indexing books.
201     """
202     def __init__(self, analyzer=None):
203         super(Index, self).__init__(analyzer)
204
205     def index_tags(self):
206         """
207         Re-index global tag list.
208         Removes all tags from the index, then indexes them again.
209         Indexed fields include: id, name (with and without Polish stems), category.
210         """
211         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
212         self.index.deleteDocuments(q)
213
214         for tag in catalogue.models.Tag.objects.all():
215             doc = Document()
216             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
217             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
218             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
220             self.index.addDocument(doc)
221
222     def create_book_doc(self, book):
223         """
224         Create a Lucene document referring to the book id.
225         """
226         doc = Document()
227         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
228         if book.parent is not None:
229             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
230         return doc
231
232     def remove_book(self, book):
233         """Removes a book from search index.
234         book - Book instance."""
235         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
236         self.index.deleteDocuments(q)
237
238     def index_book(self, book, book_info=None, overwrite=True):
239         """
240         Indexes the book.
241         Creates a lucene document for extracted metadata
242         and calls self.index_content() to index the contents of the book.
243         """
244         if overwrite:
245             self.remove_book(book)
246
247         book_doc = self.create_book_doc(book)
248         meta_fields = self.extract_metadata(book, book_info)
249         for f in meta_fields.values():
250             if isinstance(f, list) or isinstance(f, tuple):
251                 for elem in f:
252                     book_doc.add(elem)
253             else:
254                 book_doc.add(f)
255
256         self.index.addDocument(book_doc)
257         del book_doc
258
259         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
260
261     master_tags = [
262         'opowiadanie',
263         'powiesc',
264         'dramat_wierszowany_l',
265         'dramat_wierszowany_lp',
266         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
267         'wywiad',
268         ]
269
270     ignore_content_tags = [
271         'uwaga', 'extra',
272         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
273         'didaskalia',
274         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
275         ]
276
277     footnote_tags = ['pa', 'pt', 'pr', 'pe']
278
279     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
280
281     published_date_re = re.compile("([0-9]+)[\]. ]*$")
282
283     def extract_metadata(self, book, book_info=None):
284         """
285         Extracts metadata from the book and returns a map of fields keyed by field name.
286         """
287         fields = {}
288
289         if book_info is None:
290             book_info = dcparser.parse(open(book.xml_file.path))
291
292         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
293         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
294         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
295
296         # For each Dublin Core field, the validator indicates the value type.
297         for field in dcparser.BookInfo.FIELDS:
298             if hasattr(book_info, field.name):
299                 if not getattr(book_info, field.name):
300                     continue
301                 # since no type information is available, we use validator
302                 type_indicator = field.validator
303                 if type_indicator == dcparser.as_unicode:
304                     s = getattr(book_info, field.name)
305                     if field.multiple:
306                         s = ', '.join(s)
307                     try:
308                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
309                     except JavaError as je:
310                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
311                 elif type_indicator == dcparser.as_person:
312                     p = getattr(book_info, field.name)
313                     if isinstance(p, dcparser.Person):
314                         persons = unicode(p)
315                     else:
316                         persons = ', '.join(map(unicode, p))
317                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
318                 elif type_indicator == dcparser.as_date:
319                     dt = getattr(book_info, field.name)
320                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
321                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
322
323         # get published date
324         source = book_info.source_name
325         match = self.published_date_re.search(source)
326         print("published date is %s %s" % (match, match is not None and match.groups()))
327         if match is not None:
328             fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
329
330         return fields
331
332     def add_gaps(self, fields, fieldname):
333         """
334         Interleaves a list of fields with gap fields (indexed single spaces) and returns the result.
335         This allows phrase queries that do not span across the gaps (when slop is 0).
336         """
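        # Example (illustrative sketch): add_gaps([f1, f2, f3], 'tags') returns
        # [f1, gap, f2, gap, f3], where each gap is a NOT_ANALYZED single-space
        # Field in the 'tags' field.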
337         def gap():
338             while True:
339                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
340         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
341
342     def get_master(self, root):
343         """
344         Returns the first master tag from an etree.
345         """
346         for master in root.iter():
347             if master.tag in self.master_tags:
348                 return master
349
350     def index_content(self, book, book_fields=[]):
351         """
352         Walks the book XML and extracts content from it.
353         Adds parts for each header tag and for each fragment.
354         """
355         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
356         root = wld.edoc.getroot()
357
358         master = self.get_master(root)
359         if master is None:
360             return []
361
362         def walker(node, ignore_tags=[]):
363             yield node, None
364             for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
365                 for b, e in walker(child):
366                     yield b, e
367             yield None, node
368             return
369
370         def fix_format(text):
371             #            separator = [u" ", u"\t", u".", u";", u","]
372             if isinstance(text, list):
373                 # need to join it first
374                 text = filter(lambda s: s is not None, text)
375                 text = u' '.join(text)
376                 # for i in range(len(text)):
377                 #     if i > 0:
378                 #         if text[i][0] not in separator\
379                 #             and text[i - 1][-1] not in separator:
380                 #          text.insert(i, u" ")
381
382             return re.sub("(?m)/$", "", text)
383
384         def add_part(snippets, **fields):
385             doc = self.create_book_doc(book)
386             for f in book_fields:
387                 doc.add(f)
388
389             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
390             doc.add(NumericField("header_span", Field.Store.YES, True)\
391                     .setIntValue(fields.get('header_span', 1)))
392             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
393
394             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
395                           Field.TermVector.WITH_POSITIONS_OFFSETS))
396
397             snip_pos = snippets.add(fields["content"])
398             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
399             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
400
401             if 'fragment_anchor' in fields:
402                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
403                               Field.Store.YES, Field.Index.NOT_ANALYZED))
404
405             if 'themes' in fields:
406                 themes, themes_pl = zip(*[
407                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
408                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
409                      for theme in fields['themes']])
410
411                 themes = self.add_gaps(themes, 'themes')
412                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
413
414                 for t in themes:
415                     doc.add(t)
416                 for t in themes_pl:
417                     doc.add(t)
418
419             return doc
420
421         def give_me_utf8(s):
422             if isinstance(s, unicode):
423                 return s.encode('utf-8')
424             else:
425                 return s
426
427         fragments = {}
428         snippets = Snippets(book.id).open('w')
429         position = 0
430         try:
431             for header in list(master):
432
433                 if header.tag in self.skip_header_tags:
434                     continue
435                 if header.tag is etree.Comment:
436                     continue
437
438                 # section content
439                 content = []
440                 footnote = None
441
442                 for start, end in walker(header, ignore_tags=self.ignore_content_tags):
443                     # handle footnotes
444                     if start is not None and start.tag in self.footnote_tags:
445                         footnote = ' '.join(start.itertext())
446                     elif end is not None and footnote is not None and end.tag in self.footnote_tags:
447                         doc = add_part(snippets, header_index=position, header_type=header.tag,
448                                        content=footnote)
449
450                         self.index.addDocument(doc)
451
452                         footnote = None
453
454                     # handle fragments and themes.
455                     if start is not None and start.tag == 'begin':
456                         fid = start.attrib['id'][1:]
457                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
458
459                     elif start is not None and start.tag == 'motyw':
460                         fid = start.attrib['id'][1:]
461                         if start.text is not None:
462                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
463
464                     elif start is not None and start.tag == 'end':
465                         fid = start.attrib['id'][1:]
466                         if fid not in fragments:
467                             continue  # a broken <end> node, skip it
468                                       #                        import pdb; pdb.set_trace()
469                         frag = fragments[fid]
470                         if frag['themes'] == []:
471                             continue  # empty themes list.
472                         del fragments[fid]
473
474                         doc = add_part(snippets,
475                                        header_type=frag['start_header'],
476                                        header_index=frag['start_section'],
477                                        header_span=position - frag['start_section'] + 1,
478                                        fragment_anchor=fid,
479                                        content=fix_format(frag['content']),
480                                        themes=frag['themes'])
481
482                         self.index.addDocument(doc)
483
484                         # Collect content.
485                     elif start is not None:
486                         for frag in fragments.values():
487                             frag['content'].append(start.text)
488                         content.append(start.text)
489                     elif end is not None:
490                         for frag in fragments.values():
491                             frag['content'].append(end.tail)
492                         content.append(end.tail)
493
494                         # in the end, add a section text.
495                 doc = add_part(snippets, header_index=position, header_type=header.tag,
496                                content=fix_format(content))
497
498                 self.index.addDocument(doc)
499                 position += 1
500
501         finally:
502             snippets.close()
503
504
505 def log_exception_wrapper(f):
506     def _wrap(*a):
507         try:
508             f(*a)
509         except Exception as e:
510             print("Error in indexing thread: %s" % e)
511             traceback.print_exc()
512             raise
513     return _wrap
514
515
516 class ReusableIndex(Index):
517     """
518     Works like Index, but does not close/optimize the Lucene index
519     until program exit (uses an atexit hook).
520     This is useful for the importbooks command.
521
522     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
523     """
524     index = None
525
526     def open(self, analyzer=None, threads=4):
527         if ReusableIndex.index is not None:
528             self.index = ReusableIndex.index
529         else:
530             print("opening index")
531             Index.open(self, analyzer)
532             ReusableIndex.index = self.index
533             atexit.register(ReusableIndex.close_reusable)
534
535     # def index_book(self, *args, **kw):
536     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
537     #     ReusableIndex.pool_jobs.append(job)
538
539     @staticmethod
540     def close_reusable():
541         if ReusableIndex.index is not None:
542             ReusableIndex.index.optimize()
543             ReusableIndex.index.close()
544             ReusableIndex.index = None
545
546     def close(self):
547         pass
548
549
550 class JoinSearch(object):
551     """
552     This mixin could be used to handle block join queries.
553     (currently unused)
554     """
555     def __init__(self, *args, **kw):
556         super(JoinSearch, self).__init__(*args, **kw)
557
558     def wrapjoins(self, query, fields=[]):
559         """
560         This function modifies the query recursively,
561         so that contained Term and Phrase queries which match
562         the provided fields are wrapped in a BlockJoinQuery
563         and thus delegated to child documents.
564         """
565         if BooleanQuery.instance_(query):
566             qs = BooleanQuery.cast_(query)
567             for clause in qs:
568                 clause = BooleanClause.cast_(clause)
569                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
570             return qs
571         else:
572             termset = HashSet()
573             query.extractTerms(termset)
574             for t in termset:
575                 t = Term.cast_(t)
576                 if t.field() not in fields:
577                     return query
578             return BlockJoinQuery(query, self.parent_filter,
579                                   BlockJoinQuery.ScoreMode.Total)
580
581     def bsearch(self, query, max_results=50):
582         q = self.query(query)
583         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
584
585         tops = self.searcher.search(bjq, max_results)
586         bks = []
587         for found in tops.scoreDocs:
588             doc = self.searcher.doc(found.doc)
589             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
590         return (bks, tops.totalHits)
591
592
593 class SearchResult(object):
594     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
595         if tokens_cache is None: tokens_cache = {}
596
597         if score:
598             self._score = score
599         else:
600             self._score = scoreDocs.score
601
602         self.boost = 1.0
603
604         self._hits = []
605         self._processed_hits = None  # processed hits
606
607         stored = search.searcher.doc(scoreDocs.doc)
608         self.book_id = int(stored.get("book_id"))
609
610         header_type = stored.get("header_type")
611         if not header_type:
612             return
613
614         sec = (header_type, int(stored.get("header_index")))
615         header_span = stored.get('header_span')
616         header_span = header_span is not None and int(header_span) or 1
617
618         fragment = stored.get("fragment_anchor")
619
620         pd = stored.get("published_date")
621         if pd is None:
622             print "published_date is none for book %d" % self.book_id
623             pd = 0
624         self.published_date = int(pd)
625
626         if snippets:
627             snippets = snippets.replace("/\n", "\n")
628         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
629
630         self._hits.append(hit)
631
632         self.search = search
633         self.searched = searched
634         self.tokens_cache = tokens_cache
635
636     @property
637     def score(self):
638         return self._score * self.boost
639
640     def merge(self, other):
641         if self.book_id != other.book_id:
642             raise ValueError("this search result is for book %d; tried to merge with book %d" % (self.book_id, other.book_id))
643         self._hits += other._hits
644         if other.score > self.score:
645             self._score = other._score
646         return self
647
648     def get_book(self):
649         return catalogue.models.Book.objects.get(id=self.book_id)
650
651     book = property(get_book)
652
653     @property
654     def hits(self):
655         if self._processed_hits is not None:
656             return self._processed_hits
657
658         POSITION = 0
659         FRAGMENT = 1
660         POSITION_INDEX = 1
661         POSITION_SPAN = 2
662         SCORE = 2
663         OTHER = 3
664
665         # to sections and fragments
666         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
667         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
668         sect = filter(lambda s: 0 == len(filter(
669             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
670             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
671             frags)), sect)
672
673         hits = []
674
675         # remove duplicate fragments
676         fragments = {}
677         for f in frags:
678             fid = f[FRAGMENT]
679             if fid in fragments:
680                 if fragments[fid][SCORE] >= f[SCORE]:
681                     continue
682             fragments[fid] = f
683         frags = fragments.values()
684
685         # remove duplicate sections
686         sections = {}
687
688         for s in sect:
689             si = s[POSITION][POSITION_INDEX]
690             # skip existing
691             if si in sections:
692                 if sections[si]['score'] >= s[SCORE]:
693                     continue
694
695             m = {'score': s[SCORE],
696                  'section_number': s[POSITION][POSITION_INDEX] + 1,
697                  }
698             m.update(s[OTHER])
699             sections[si] = m
700
701         hits = sections.values()
702
703         for f in frags:
704             try:
705                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
706             except catalogue.models.Fragment.DoesNotExist:
707                 # stale index
708                 continue
709
710             # Figure out if we were searching for a token matching some word in theme name.
711             themes = frag.tags.filter(category='theme')
712             themes_hit = []
713             if self.searched is not None:
714                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
715                 for theme in themes:
716                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
717                     for t in tokens:
718                         if t in name_tokens:
719                             if not theme in themes_hit:
720                                 themes_hit.append(theme)
721                             break
722
723             m = {'score': f[SCORE],
724                  'fragment': frag,
725                  'section_number': f[POSITION][POSITION_INDEX] + 1,
726                  'themes': themes,
727                  'themes_hit': themes_hit
728                  }
729             m.update(f[OTHER])
730             hits.append(m)
731
732         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
733
734         self._processed_hits = hits
735
736         return hits
737
738     def __unicode__(self):
739         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
740
741     @staticmethod
742     def aggregate(*result_lists):
743         books = {}
744         for rl in result_lists:
745             for r in rl:
746                 if r.book_id in books:
747                     books[r.book_id].merge(r)
748                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
749                 else:
750                     books[r.book_id] = r
751         return books.values()
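    # Example (illustrative sketch): SearchResult.aggregate(list_a, list_b)
    # merges results referring to the same book_id (their hits are combined
    # and the best score wins), returning one SearchResult per book.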
752
753     def __cmp__(self, other):
754         c = cmp(self.score, other.score)
755         if c == 0:
756             # this is inverted, because earlier date is better
757             return cmp(other.published_date, self.published_date)
758         else:
759             return c
760
761
762 class Hint(object):
763     """
764     Given some hint information (things we already know about
765     the search target - like author, title of a specific book, epoch, genre, kind)
766     we can narrow the search down using filters.
767     """
768     def __init__(self, search):
769         """
770         Accepts a Searcher instance.
771         """
772         self.search = search
773         self.book_tags = {}
774         self.part_tags = []
775         self._books = []
776
777     def books(self, *books):
778         """
779         Give a hint that we search these books.
780         """
781         self._books = books
782
783     def tags(self, tags):
784         """
785         Give a hint that these Tag objects (a list)
786         are relevant.
787         """
788         for t in tags:
789             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
790                 lst = self.book_tags.get(t.category, [])
791                 lst.append(t)
792                 self.book_tags[t.category] = lst
793             if t.category in ['theme', 'theme_pl']:
794                 self.part_tags.append(t)
795
796     def tag_filter(self, tags, field='tags'):
797         """
798         Given a list of tags and an optional field (they are normally in the 'tags' field),
799         returns a filter accepting only books with the specified tags.
800         """
801         q = BooleanQuery()
802
803         for tag in tags:
804             toks = self.search.get_tokens(tag.name, field=field)
805             tag_phrase = PhraseQuery()
806             for tok in toks:
807                 tag_phrase.add(Term(field, tok))
808             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
809
810         return QueryWrapperFilter(q)
811
812     def book_filter(self):
813         """
814         Filters using book tags (all tag categories except theme).
815         """
816         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
817         if tags:
818             return self.tag_filter(tags)
819         else:
820             return None
821
822     def part_filter(self):
823         """
824         This filter can be used to look for book parts.
825         It filters on book id and/or themes.
826         """
827         fs = []
828         if self.part_tags:
829             fs.append(self.tag_filter(self.part_tags, field='themes'))
830
831         if self._books != []:
832             bf = BooleanFilter()
833             for b in self._books:
834                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
835                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
836             fs.append(bf)
837
838         return Search.chain_filters(fs)
839
840     def should_search_for_book(self):
841         return self._books == []
842
843     def just_search_in(self, all):
844         """Holds logic to figure out which fields should be searched when we already have some hints."""
845         some = []
846         for field in all:
847             if field == 'authors' and 'author' in self.book_tags:
848                 continue
849             if field == 'title' and self._books != []:
850                 continue
851             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
852                 continue
853             some.append(field)
854         return some
855
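# Hint usage sketch (illustrative; tag_list and book are assumed to exist):
#
#   search = Search()
#   hint = search.hint()
#   hint.tags(tag_list)          # author/title/epoch/genre/kind and theme tags
#   hint.books(book)             # narrow the search to specific books
#   results = search.search_everywhere(u"zima", hint=hint)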
856
857 class Search(IndexStore):
858     """
859     Search facilities.
860     """
861     def __init__(self, default_field="content"):
862         IndexStore.__init__(self)
863         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
864         # self.analyzer = WLAnalyzer()
865         self.searcher = IndexSearcher(self.store, True)
866         self.parser = QueryParser(Version.LUCENE_34, default_field,
867                                   self.analyzer)
868
869         self.parent_filter = TermsFilter()
870         self.parent_filter.addTerm(Term("is_book", "true"))
871
872     def query(self, query):
873         """Parse a query in default Lucene syntax (meant for humans).
874         """
875         return self.parser.parse(query)
876
877     def simple_search(self, query, max_results=50):
878         """Runs a query for books using Lucene syntax (meant for humans).
879         Returns (books, total_hits).
880         """
881
882         tops = self.searcher.search(self.query(query), max_results)
883         bks = []
884         for found in tops.scoreDocs:
885             doc = self.searcher.doc(found.doc)
886             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
887         return (bks, tops.totalHits)
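        # Usage sketch (illustrative; assumes the index has already been built,
        # e.g. by the importbooks command):
        #
        #   search = Search()
        #   books, total = search.simple_search(u'mickiewicz pan tadeusz')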
888
889     def get_tokens(self, searched, field='content', cached=None):
890         """Returns tokens analyzed by the analyzer appropriate for the given field.
891         The argument can be a StringReader, a string/unicode, or a list of tokens. In the
892         last case the tokens are returned as-is (so we can reuse them if we don't change the analyzer).
893         """
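        # Example (illustrative sketch): get_tokens(u"Pan Tadeusz", field='SIMPLE')
        # might return [u'pan', u'tadeusz']; passing that list back in returns it
        # unchanged.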
894         if cached is not None and field in cached:
895             return cached[field]
896
897         if isinstance(searched, str) or isinstance(searched, unicode):
898             searched = StringReader(searched)
899         elif isinstance(searched, list):
900             return searched
901
902         searched.reset()
903         tokens = self.analyzer.reusableTokenStream(field, searched)
904         toks = []
905         while tokens.incrementToken():
906             cta = tokens.getAttribute(CharTermAttribute.class_)
907             toks.append(cta.toString())
908
909         if cached is not None:
910             cached[field] = toks
911
912         return toks
913
914     def fuzziness(self, fuzzy):
915         """Helper method to sanitize fuzziness"""
916         if not fuzzy:
917             return None
918         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
919             return fuzzy
920         else:
921             return 0.5
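        # E.g. (sketch): fuzziness(0.8) -> 0.8, fuzziness(True) -> 0.5,
        # fuzziness(False) -> None.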
922
923     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
924         """
925         Return a PhraseQuery with a series of tokens.
926         """
927         if fuzzy:
928             phrase = MultiPhraseQuery()
929             for t in tokens:
930                 term = Term(field, t)
931                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
932                 fuzzterms = []
933
934                 while True:
935                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
936                     ft = fuzzterm.term()
937                     if ft:
938                         fuzzterms.append(ft)
939                     if not fuzzterm.next(): break
940                 if fuzzterms:
941                     phrase.add(JArray('object')(fuzzterms, Term))
942                 else:
943                     phrase.add(term)
944         else:
945             phrase = PhraseQuery()
946             phrase.setSlop(slop)
947             for t in tokens:
948                 term = Term(field, t)
949                 phrase.add(term)
950         return phrase
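        # Example (illustrative sketch): make_phrase([u'pan', u'tadeusz'], field='title')
        # builds a PhraseQuery matching both tokens in order within a slop of 2;
        # with fuzzy set, each token is expanded to similar terms from the index.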
951
952     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
953         """
954         Returns term queries joined into a boolean query.
955         modal - occurrence (MUST/SHOULD) applied to each clause of the boolean query
956         fuzzy - whether the query should be fuzzy.
957         """
958         q = BooleanQuery()
959         for t in tokens:
960             term = Term(field, t)
961             if fuzzy:
962                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
963             else:
964                 term = TermQuery(term)
965             q.add(BooleanClause(term, modal))
966         return q
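        # Example (illustrative sketch): make_term_query([u'pan', u'tadeusz'], field='authors')
        # builds (authors:pan OR authors:tadeusz); with modal=BooleanClause.Occur.MUST
        # both terms are required.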
967
968     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
969                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
970         if filters is None: filters = []
971         if tokens_cache is None: tokens_cache = {}
972
973         tokens = self.get_tokens(searched, field, cached=tokens_cache)
974
975         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
976         if book:
977             filters.append(self.term_filter(Term('is_book', 'true')))
978         top = self.searcher.search(query, self.chain_filters(filters), max_results)
979
980         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
981
982     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
983                     filters=None, tokens_cache=None, boost=None, snippets=True):
984         if filters is None: filters = []
985         if tokens_cache is None: tokens_cache = {}
986
987         if book:
988             filters.append(self.term_filter(Term('is_book', 'true')))
989
990         query = BooleanQuery()
991
992         for fld in fields:
993             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
994
995             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
996                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
997
998         top = self.searcher.search(query, self.chain_filters(filters), max_results)
999
1000         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1001                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1002
1003     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1004         """
1005         Search for perfect book matches. Just see if the query matches with some author or title,
1006         taking hints into account.
1007         """
1008         fields_to_search = ['authors', 'title']
1009         only_in = None
1010         if hint:
1011             if not hint.should_search_for_book():
1012                 return []
1013             fields_to_search = hint.just_search_in(fields_to_search)
1014             only_in = hint.book_filter()
1015
1016         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1017
1018         books = []
1019         for q in qrys:
1020             top = self.searcher.search(q,
1021                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1022                 max_results)
1023             for found in top.scoreDocs:
1024                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1025         return books
1026
1027     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1028         fields_to_search = ['tags', 'authors', 'title']
1029
1030         only_in = None
1031         if hint:
1032             if not hint.should_search_for_book():
1033                 return []
1034             fields_to_search = hint.just_search_in(fields_to_search)
1035             only_in = hint.book_filter()
1036
1037         tokens = self.get_tokens(searched, field='SIMPLE')
1038
1039         q = BooleanQuery()
1040
1041         for fld in fields_to_search:
1042             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1043                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1044
1045         books = []
1046         top = self.searcher.search(q,
1047                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1048             max_results)
1049         for found in top.scoreDocs:
1050             books.append(SearchResult(self, found, how_found="search_book"))
1051
1052         return books
1053
1054     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1055         """
1056         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1057         some part/fragment of the book.
1058         """
1059         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1060
1061         flt = None
1062         if hint:
1063             flt = hint.part_filter()
1064
1065         books = []
1066         for q in qrys:
1067             top = self.searcher.search(q,
1068                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1069                                                            flt]),
1070                                        max_results)
1071             for found in top.scoreDocs:
1072                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1073
1074         return books
1075
1076     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1077         """
1078         Tries to use search terms to match different fields of the book (or its parts).
1079         E.g. one word can be an author's surname, another a part of the title, and the rest
1080         some words from the third chapter.
1081         """
1082         if tokens_cache is None: tokens_cache = {}
1083         books = []
1084         only_in = None
1085
1086         if hint:
1087             only_in = hint.part_filter()
1088
1089         # content only query : themes x content
1090         q = BooleanQuery()
1091
1092         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1093         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1094
1095         # only search in themes when we do not already filter by themes
1096         if hint is None or hint.just_search_in(['themes']) != []:
1097             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1098                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1099
1100         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1101                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1102
1103         topDocs = self.searcher.search(q, only_in, max_results)
1104         for found in topDocs.scoreDocs:
1105             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1106             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1107
1108         # query themes/content x author/title/tags
1109         q = BooleanQuery()
1110         in_content = BooleanQuery()
1111         in_meta = BooleanQuery()
1112
1113         for fld in ['themes_pl', 'content']:
1114             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1115
1116         for fld in ['tags', 'authors', 'title']:
1117             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1118
1119         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1120         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1121
1122         topDocs = self.searcher.search(q, only_in, max_results)
1123         for found in topDocs.scoreDocs:
1124             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1125             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1126
1127         return books
1128
1129     # def multisearch(self, query, max_results=50):
1130     #     """
1131     #     Search strategy:
1132     #     - (phrase) OR -> content
1133     #                   -> title
1134     #                   -> authors
1135     #     - (keywords)  -> authors
1136     #                   -> motyw
1137     #                   -> tags
1138     #                   -> content
1139     #     """
1140         # queryreader = StringReader(query)
1141         # tokens = self.get_tokens(queryreader)
1142
1143         # top_level = BooleanQuery()
1144         # Should = BooleanClause.Occur.SHOULD
1145
1146         # phrase_level = BooleanQuery()
1147         # phrase_level.setBoost(1.3)
1148
1149         # p_content = self.make_phrase(tokens, joined=True)
1150         # p_title = self.make_phrase(tokens, 'title')
1151         # p_author = self.make_phrase(tokens, 'author')
1152
1153         # phrase_level.add(BooleanClause(p_content, Should))
1154         # phrase_level.add(BooleanClause(p_title, Should))
1155         # phrase_level.add(BooleanClause(p_author, Should))
1156
1157         # kw_level = BooleanQuery()
1158
1159         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1160         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1161         # kw_level.add(j_themes, Should)
1162         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1163         # j_con = self.make_term_query(tokens, joined=True)
1164         # kw_level.add(j_con, Should)
1165
1166         # top_level.add(BooleanClause(phrase_level, Should))
1167         # top_level.add(BooleanClause(kw_level, Should))
1168
1169         # return None
1170
1171     def get_snippets(self, scoreDoc, query, field='content'):
1172         """
1173         Returns a highlighted snippet for the given scoreDoc.
1174         """
1175         htmlFormatter = SimpleHTMLFormatter()
1176         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1177
1178         stored = self.searcher.doc(scoreDoc.doc)
1179
1180         position = stored.get('snippets_position')
1181         length = stored.get('snippets_length')
1182         if position is None or length is None:
1183             return None
1184         # locate content.
1185         snippets = Snippets(stored.get('book_id')).open()
1186         try:
1187             text = snippets.get((int(position),
1188                                  int(length)))
1189         finally:
1190             snippets.close()
1191
1192         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1193         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1194         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1195
1196         return snip
1197
1198     @staticmethod
1199     def enum_to_array(enum):
1200         """
1201         Converts a Lucene TermEnum to an array of Terms, suitable for
1202         adding to queries.
1203         """
1204         terms = []
1205
1206         while True:
1207             t = enum.term()
1208             if t:
1209                 terms.append(t)
1210             if not enum.next(): break
1211
1212         if terms:
1213             return JArray('object')(terms, Term)
1214
1215     def search_tags(self, query, filter=None, max_results=40):
1216         """
1217         Search for Tag objects using query.
1218         """
1219         tops = self.searcher.search(query, filter, max_results)
1220
1221         tags = []
1222         for found in tops.scoreDocs:
1223             doc = self.searcher.doc(found.doc)
1224             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1225             tags.append(tag)
1226             print "%s (%d) -> %f" % (tag, tag.id, found.score)
1227
1228         return tags
1229
1230     def search_books(self, query, filter=None, max_results=10):
1231         """
1232         Searches for Book objects using query
1233         """
1234         bks = []
1235         tops = self.searcher.search(query, filter, max_results)
1236         for found in tops.scoreDocs:
1237             doc = self.searcher.doc(found.doc)
1238             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1239         return bks
1240
1241     def create_prefix_phrase(self, toks, field):
1242         q = MultiPhraseQuery()
1243         for i in range(len(toks)):
1244             t = Term(field, toks[i])
1245             if i == len(toks) - 1:
1246                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1247                 if pterms:
1248                     q.add(pterms)
1249                 else:
1250                     q.add(t)
1251             else:
1252                 q.add(t)
1253         return q
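        # Example (illustrative sketch): create_prefix_phrase([u'pan', u'tad'], 'title')
        # matches titles containing "pan" followed by any term with the prefix "tad",
        # by expanding the last token into all matching index terms.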
1254
1255     @staticmethod
1256     def term_filter(term, inverse=False):
1257         only_term = TermsFilter()
1258         only_term.addTerm(term)
1259
1260         if inverse:
1261             neg = BooleanFilter()
1262             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1263             only_term = neg
1264
1265         return only_term
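        # Example (illustrative sketch): term_filter(Term('is_book', 'true')) keeps
        # only top-level book documents; with inverse=True it keeps everything else.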
1266
1267     def hint_tags(self, string, max_results=50):
1268         """
1269         Return auto-complete hints for tags
1270         using prefix search.
1271         """
1272         toks = self.get_tokens(string, field='SIMPLE')
1273         top = BooleanQuery()
1274
1275         for field in ['tag_name', 'tag_name_pl']:
1276             q = self.create_prefix_phrase(toks, field)
1277             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1278
1279         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1280
1281         return self.search_tags(top, no_book_cat, max_results=max_results)
1282
1283     def hint_books(self, string, max_results=50):
1284         """
1285         Returns auto-complete hints for book titles
1286         (needed because we do not index 'pseudo' title tags).
1287         Uses prefix search.
1288         """
1289         toks = self.get_tokens(string, field='SIMPLE')
1290
1291         q = self.create_prefix_phrase(toks, 'title')
1292
1293         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1294
1295     @staticmethod
1296     def chain_filters(filters, op=ChainedFilter.AND):
1297         """
1298         Chains a filter list together
1299         """
1300         filters = filter(lambda x: x is not None, filters)
1301         if not filters:
1302             return None
1303         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1304         return chf
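        # Example (illustrative sketch): chain_filters([f1, None, f2]) drops the None
        # and returns a ChainedFilter AND-ing f1 and f2; for an empty or all-None list
        # it returns None.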
1305
1306     def filtered_categories(self, tags):
1307         """
1308         Return a list of tag categories, present in tags list.
1309         """
1310         cats = {}
1311         for t in tags:
1312             cats[t.category] = True
1313         return cats.keys()
1314
1315     def hint(self):
1316         return Hint(self)