content indexing - bugs.
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 import catalogue.models
29 from multiprocessing.pool import ThreadPool
30 from threading import current_thread
31 import atexit
32 import traceback
33
34
35 class WLAnalyzer(PerFieldAnalyzerWrapper):
36     def __init__(self):
37         polish = PolishAnalyzer(Version.LUCENE_34)
38         #        polish_gap.setPositionIncrementGap(999)
39
40         simple = SimpleAnalyzer(Version.LUCENE_34)
41         #        simple_gap.setPositionIncrementGap(999)
42
43         keyword = KeywordAnalyzer(Version.LUCENE_34)
44
45         # not sure if needed: there's NOT_ANALYZED meaning basically the same
46
47         PerFieldAnalyzerWrapper.__init__(self, polish)
48
49         self.addAnalyzer("tags", simple)
50         self.addAnalyzer("technical_editors", simple)
51         self.addAnalyzer("editors", simple)
52         self.addAnalyzer("url", keyword)
53         self.addAnalyzer("source_url", keyword)
54         self.addAnalyzer("source_name", simple)
55         self.addAnalyzer("publisher", simple)
56         self.addAnalyzer("authors", simple)
57         self.addAnalyzer("title", simple)
58
59         self.addAnalyzer("is_book", keyword)
60         # shouldn't the title have two forms? _pl and simple?
61
62         self.addAnalyzer("themes", simple)
63         self.addAnalyzer("themes_pl", polish)
64
65         self.addAnalyzer("tag_name", simple)
66         self.addAnalyzer("tag_name_pl", polish)
67
68         self.addAnalyzer("translators", simple)
69
70         self.addAnalyzer("KEYWORD", keyword)
71         self.addAnalyzer("SIMPLE", simple)
72         self.addAnalyzer("POLISH", polish)
73
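# Illustrative usage sketch: the wrapper above is the analyzer that BaseIndex
# hands to IndexWriter, so e.g. "tags" is analyzed with SimpleAnalyzer while
# unmapped fields such as "content" fall back to the default PolishAnalyzer.
# Assuming `store` is the SimpleFSDirectory set up by IndexStore below:
#
#     analyzer = WLAnalyzer()
#     writer = IndexWriter(store, analyzer, IndexWriter.MaxFieldLength.LIMITED)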
74
75 class IndexStore(object):
76     """
77     Provides access to search index.
78
79     self.store - lucene index directory
80     """
81     def __init__(self):
82         self.make_index_dir()
83         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
84
85     def make_index_dir(self):
86         try:
87             os.makedirs(settings.SEARCH_INDEX)
88         except OSError as exc:
89             if exc.errno == errno.EEXIST:
90                 pass
91             else: raise
92
93
94 class IndexChecker(IndexStore):
95     def __init__(self):
96         IndexStore.__init__(self)
97
98     def check(self):
99         checker = CheckIndex(self.store)
100         status = checker.checkIndex()
101         return status
102
103
104 class Snippets(object):
105     """
106     This class manages snippet files for an indexed object (a book).
107     The snippets are concatenated together, and their positions and
108     lengths are kept in lucene index fields.
109     """
110     SNIPPET_DIR = "snippets"
111
112     def __init__(self, book_id):
113         try:
114             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
115         except OSError as exc:
116             if exc.errno == errno.EEXIST:
117                 pass
118             else: raise
119         self.book_id = book_id
120         self.file = None
121
122     def open(self, mode='r'):
123         """
124         Open the snippet file. Call .close() afterwards.
125         """
126         if 'b' not in mode:
127             mode += 'b'
128         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
129         self.position = 0
130         return self
131
132     def add(self, snippet):
133         """
134         Append a snippet (unicode) to the snippet file.
135         Return a (position, length) tuple
136         """
137         txt = snippet.encode('utf-8')
138         l = len(txt)
139         self.file.write(txt)
140         pos = (self.position, l)
141         self.position += l
142         return pos
143
144     def get(self, pos):
145         """
146         Given a (position, length) tuple, return the snippet stored
147         there as a unicode string.
148         """
149         self.file.seek(pos[0], 0)
150         txt = self.file.read(pos[1]).decode('utf-8')
151         return txt
152
153     def close(self):
154         """Close snippet file"""
155         self.file.close()
156
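# Illustrative usage sketch: a snippet round trip for a hypothetical `book_id`;
# add() returns the (position, length) pair that index_content() stores in the
# index and get_snippets() later passes back to get().
#
#     snips = Snippets(book_id).open('w')
#     try:
#         pos = snips.add(u"Fragment text...")
#     finally:
#         snips.close()
#
#     snips = Snippets(book_id).open()
#     try:
#         text = snips.get(pos)
#     finally:
#         snips.close()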
157
158 class BaseIndex(IndexStore):
159     """
160     Base index class.
161     Provides basic operations on index: opening, closing, optimizing.
162     """
163     def __init__(self, analyzer=None):
164         super(BaseIndex, self).__init__()
165         self.index = None
166         if not analyzer:
167             analyzer = WLAnalyzer()
168         self.analyzer = analyzer
169
170     def open(self, analyzer=None):
171         if self.index:
172             raise Exception("Index is already opened")
173         self.index = IndexWriter(self.store, self.analyzer,\
174                                  IndexWriter.MaxFieldLength.LIMITED)
175         return self.index
176
177     def optimize(self):
178         self.index.optimize()
179
180     def close(self):
181         try:
182             self.index.optimize()
183         except JavaError as je:
184             print "Error during optimize phase, check index: %s" % je
185
186         self.index.close()
187         self.index = None
188
189     def __enter__(self):
190         self.open()
191         return self
192
193     def __exit__(self, type, value, tb):
194         self.close()
195
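# Illustrative usage sketch: BaseIndex is a context manager, so its subclasses
# (Index and ReusableIndex below) are typically used with a with-statement,
# which opens the IndexWriter and closes/optimizes it on exit:
#
#     with Index() as index:
#         index.index_tags()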
196
197 class Index(BaseIndex):
198     """
199     Class indexing books.
200     """
201     def __init__(self, analyzer=None):
202         super(Index, self).__init__(analyzer)
203
204     def index_tags(self):
205         """
206         Re-index global tag list.
207         Removes all tags from index, then index them again.
208         Indexed fields include: id, name (with and without polish stems), category
209         """
210         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
211         self.index.deleteDocuments(q)
212
213         for tag in catalogue.models.Tag.objects.all():
214             doc = Document()
215             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
216             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
217             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
218             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
219             self.index.addDocument(doc)
220
221     def create_book_doc(self, book):
222         """
223         Create a lucene document referring to the book id.
224         """
225         doc = Document()
226         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
227         if book.parent is not None:
228             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
229         return doc
230
231     def remove_book(self, book):
232         """Removes a book from search index.
233         book - Book instance."""
234         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
235         self.index.deleteDocuments(q)
236
237     def index_book(self, book, book_info=None, overwrite=True):
238         """
239         Indexes the book.
240         Creates a lucene document for extracted metadata
241         and calls self.index_content() to index the contents of the book.
242         """
243         if overwrite:
244             self.remove_book(book)
245
246         book_doc = self.create_book_doc(book)
247         meta_fields = self.extract_metadata(book, book_info)
248         for f in meta_fields.values():
249             if isinstance(f, list) or isinstance(f, tuple):
250                 for elem in f:
251                     book_doc.add(elem)
252             else:
253                 book_doc.add(f)
254
255         self.index.addDocument(book_doc)
256         del book_doc
257
258         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])
259
260     master_tags = [
261         'opowiadanie',
262         'powiesc',
263         'dramat_wierszowany_l',
264         'dramat_wierszowany_lp',
265         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
266         'wywiad'
267         ]
268
269     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
270
271     def extract_metadata(self, book, book_info=None):
272         """
273         Extracts metadata from the book and returns a map of fields keyed by field name.
274         """
275         fields = {}
276
277         if book_info is None:
278             book_info = dcparser.parse(open(book.xml_file.path))
279
280         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
281         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
282         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
283
284         # validator, name
285         for field in dcparser.BookInfo.FIELDS:
286             if hasattr(book_info, field.name):
287                 if not getattr(book_info, field.name):
288                     continue
289                 # since no type information is available, we use validator
290                 type_indicator = field.validator
291                 if type_indicator == dcparser.as_unicode:
292                     s = getattr(book_info, field.name)
293                     if field.multiple:
294                         s = ', '.join(s)
295                     try:
296                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
297                     except JavaError as je:
298                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
299                 elif type_indicator == dcparser.as_person:
300                     p = getattr(book_info, field.name)
301                     if isinstance(p, dcparser.Person):
302                         persons = unicode(p)
303                     else:
304                         persons = ', '.join(map(unicode, p))
305                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
306                 elif type_indicator == dcparser.as_date:
307                     dt = getattr(book_info, field.name)
308                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
309                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
310
311         return fields
312
313     def add_gaps(self, fields, fieldname):
314         """
315         Interposes a list of fields with gap fields (indexed spaces) and returns the result.
316         This allows phrase queries which do not cross the gaps (when slop is 0).
317         """
318         def gap():
319             while True:
320                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
321         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
322
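    # Illustrative sketch: for a book with tags "a" and "b", add_gaps() returns
    # [Field("tags", "a"), Field("tags", " "), Field("tags", "b")], so a slop-0
    # phrase query cannot match across the boundary between two tag values.
    # Mirroring the call in extract_metadata() below:
    #
    #     tag_fields = [Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags]
    #     fields['tags'] = self.add_gaps(tag_fields, 'tags')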
323     def get_master(self, root):
324         """
325         Returns the first master tag from an etree.
326         """
327         for master in root.iter():
328             if master.tag in self.master_tags:
329                 return master
330
331     def index_content(self, book, book_fields=[]):
332         """
333         Walks the book XML and extracts content from it.
334         Adds parts for each header tag and for each fragment.
335         """
336         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
337         root = wld.edoc.getroot()
338
339         master = self.get_master(root)
340         if master is None:
341             return []
342
343         def walker(node):
344             yield node, None
345             for child in list(node):
346                 for b, e in walker(child):
347                     yield b, e
348             yield None, node
349             return
350
351         def fix_format(text):
352             return re.sub("(?m)/$", "", text)
353
354         def add_part(snippets, **fields):
355             doc = self.create_book_doc(book)
356             for f in book_fields:
357                 doc.add(f)
358
359             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
360             doc.add(NumericField("header_span", Field.Store.YES, True)\
361                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
362             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
363
364             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
365                           Field.TermVector.WITH_POSITIONS_OFFSETS))
366
367             snip_pos = snippets.add(fields["content"])
368             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
369             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
370
371             if 'fragment_anchor' in fields:
372                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
373                               Field.Store.YES, Field.Index.NOT_ANALYZED))
374
375             if 'themes' in fields:
376                 themes, themes_pl = zip(*[
377                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
378                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
379                      for theme in fields['themes']])
380
381                 themes = self.add_gaps(themes, 'themes')
382                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
383
384                 for t in themes:
385                     doc.add(t)
386                 for t in themes_pl:
387                     doc.add(t)
388
389             return doc
390
391         def give_me_utf8(s):
392             if isinstance(s, unicode):
393                 return s.encode('utf-8')
394             else:
395                 return s
396
397         fragments = {}
398         snippets = Snippets(book.id).open('w')
399         try:
400             for header, position in zip(list(master), range(len(master))):
401
402                 if header.tag in self.skip_header_tags:
403                     continue
404
405                 # section content
406                 content = []
407
408                 for start, end in walker(header):
409                         # handle fragments and themes.
410                     if start is not None and start.tag == 'begin':
411                         fid = start.attrib['id'][1:]
412                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
413
414                     elif start is not None and start.tag == 'motyw':
415                         fid = start.attrib['id'][1:]
416                         if start.text is not None:
417                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
418
419                     elif start is not None and start.tag == 'end':
420                         fid = start.attrib['id'][1:]
421                         if fid not in fragments:
422                             continue  # a broken <end> node, skip it
423                                       #                        import pdb; pdb.set_trace()
424                         frag = fragments[fid]
425                         if frag['themes'] == []:
426                             continue  # empty themes list.
427                         del fragments[fid]
428
429                         def jstr(l):
430                             return u' '.join(map(
431                                 lambda x: x == None and u'(none)' or unicode(x),
432                                 l))
433
434                         doc = add_part(snippets,
435                                        header_type=frag['start_header'],
436                                        header_index=frag['start_section'],
437                                        header_span=position - frag['start_section'] + 1,
438                                        fragment_anchor=fid,
439                                        content=u' '.join(filter(lambda s: s is not None, frag['content'])),
440                                        themes=frag['themes'])
441
442                         self.index.addDocument(doc)
443
444                         # Collect content.
445                     elif start is not None:
446                         for frag in fragments.values():
447                             frag['content'].append(start.text)
448                         content.append(start.text)
449                     elif end is not None:
450                         for frag in fragments.values():
451                             frag['content'].append(end.tail)
452                         content.append(end.tail)
453
454                         # in the end, add a section text.
455                 doc = add_part(snippets, header_index=position, header_type=header.tag,
456                                content=fix_format(u' '.join(filter(lambda s: s is not None, content))))
457
458                 self.index.addDocument(doc)
459
460         finally:
461             snippets.close()
462
463
464 def log_exception_wrapper(f):
465     def _wrap(*a):
466         try:
467             f(*a)
468         except Exception as e:
469             print("Error in indexing thread: %s" % e)
470             traceback.print_exc()
471             raise
472     return _wrap
473
474
475 class ReusableIndex(Index):
476     """
477     Works like Index, but does not close/optimize the Lucene index
478     until program exit (uses an atexit hook).
479     This is useful for the importbooks command.
480
481     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
482     """
483     index = None
484
485     def open(self, analyzer=None, threads=4):
486         if ReusableIndex.index is not None:
487             self.index = ReusableIndex.index
488         else:
489             print("opening index")
490             Index.open(self, analyzer)
491             ReusableIndex.index = self.index
492             atexit.register(ReusableIndex.close_reusable)
493
494     # def index_book(self, *args, **kw):
495     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
496     #     ReusableIndex.pool_jobs.append(job)
497
498     @staticmethod
499     def close_reusable():
500         if ReusableIndex.index is not None:
501             ReusableIndex.index.optimize()
502             ReusableIndex.index.close()
503             ReusableIndex.index = None
504
505     def close(self):
506         pass
507
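# Illustrative usage sketch: ReusableIndex shares one IndexWriter across many
# with-blocks; when atexit cannot be relied on (e.g. a long-running worker),
# close it explicitly. `books` is a hypothetical Book queryset:
#
#     for book in books:
#         with ReusableIndex() as index:
#             index.index_book(book)
#     ReusableIndex.close_reusable()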
508
509 class JoinSearch(object):
510     """
511     This mixin could be used to handle block join queries.
512     (currently unused)
513     """
514     def __init__(self, *args, **kw):
515         super(JoinSearch, self).__init__(*args, **kw)
516
517     def wrapjoins(self, query, fields=[]):
518         """
519         This function modifies the query recursively, so that
520         contained Term and Phrase queries which match the
521         provided fields are wrapped in a BlockJoinQuery
522         and thus delegated to child documents.
523         """
524         if BooleanQuery.instance_(query):
525             qs = BooleanQuery.cast_(query)
526             for clause in qs:
527                 clause = BooleanClause.cast_(clause)
528                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
529             return qs
530         else:
531             termset = HashSet()
532             query.extractTerms(termset)
533             for t in termset:
534                 t = Term.cast_(t)
535                 if t.field() not in fields:
536                     return query
537             return BlockJoinQuery(query, self.parent_filter,
538                                   BlockJoinQuery.ScoreMode.Total)
539
540     def bsearch(self, query, max_results=50):
541         q = self.query(query)
542         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
543
544         tops = self.searcher.search(bjq, max_results)
545         bks = []
546         for found in tops.scoreDocs:
547             doc = self.searcher.doc(found.doc)
548             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
549         return (bks, tops.totalHits)
550
551
552 class SearchResult(object):
553     def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
554         if score:
555             self.score = score
556         else:
557             self.score = scoreDocs.score
558
559         self._hits = []
560         self.hits = None  # processed hits
561
562         stored = searcher.doc(scoreDocs.doc)
563         self.book_id = int(stored.get("book_id"))
564
565         header_type = stored.get("header_type")
566         if not header_type:
567             return
568
569         sec = (header_type, int(stored.get("header_index")))
570         header_span = stored.get('header_span')
571         header_span = header_span is not None and int(header_span) or 1
572
573         fragment = stored.get("fragment_anchor")
574
575         if snippets:
576             snippets = snippets.replace("/\n", "\n")
577         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
578
579         self._hits.append(hit)
580
581     def merge(self, other):
582         if self.book_id != other.book_id:
583             raise ValueError("this search result is for book %d; tried to merge with book %d" % (self.book_id, other.book_id))
584         self._hits += other._hits
585         if other.score > self.score:
586             self.score = other.score
587         return self
588
589     def get_book(self):
590         return catalogue.models.Book.objects.get(id=self.book_id)
591
592     book = property(get_book)
593
594     def process_hits(self):
595         POSITION = 0
596         FRAGMENT = 1
597         POSITION_INDEX = 1
598         POSITION_SPAN = 2
599         SCORE = 2
600         OTHER = 3
601
602         # to sections and fragments
603         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
604         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
605         sect = filter(lambda s: 0 == len(filter(
606             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
607             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
608             frags)), sect)
609
610         hits = []
611
612         # remove duplicate fragments
613         fragments = {}
614         for f in frags:
615             fid = f[FRAGMENT]
616             if fid in fragments:
617                 if fragments[fid][SCORE] >= f[SCORE]:
618                     continue
619             fragments[fid] = f
620         frags = fragments.values()
621
622         # remove duplicate sections
623         sections = {}
624
625         for s in sect:
626             si = s[POSITION][POSITION_INDEX]
627             # skip existing
628             if si in sections:
629                 if sections[si]['score'] >= s[SCORE]:
630                     continue
631
632             m = {'score': s[SCORE],
633                  'section_number': s[POSITION][POSITION_INDEX] + 1,
634                  }
635             m.update(s[OTHER])
636             sections[si] = m
637
638         hits = sections.values()
639
640         for f in frags:
641             frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
642             m = {'score': f[SCORE],
643                  'fragment': frag,
644                  'section_number': f[POSITION][POSITION_INDEX] + 1,
645                  'themes': frag.tags.filter(category='theme')
646                  }
647             m.update(f[OTHER])
648             hits.append(m)
649
650         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
651
652         self.hits = hits
653
654         return self
655
656     def __unicode__(self):
657         return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)
658
659     @staticmethod
660     def aggregate(*result_lists):
661         books = {}
662         for rl in result_lists:
663             for r in rl:
664                 if r.book_id in books:
665                     books[r.book_id].merge(r)
666                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
667                 else:
668                     books[r.book_id] = r
669         return books.values()
670
671     def __cmp__(self, other):
672         return cmp(self.score, other.score)
673
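# Illustrative usage sketch: results from several search strategies can be
# merged per book with aggregate() and ordered by score via __cmp__; `search`
# is a Search instance and `query` a hypothetical search string:
#
#     results = SearchResult.aggregate(
#         search.search_perfect_book(query),
#         search.search_everywhere(query))
#     for r in sorted(results, reverse=True):
#         r.process_hits()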
674
675 class Hint(object):
676     """
677     Given some hint information (things we already know) about
678     our search target - like author, title (a specific book), epoch, genre, kind -
679     we can narrow down the search using filters.
680     """
681     def __init__(self, search):
682         """
683         Accepts a Searcher instance.
684         """
685         self.search = search
686         self.book_tags = {}
687         self.part_tags = []
688         self._books = []
689
690     def books(self, *books):
691         """
692         Give a hint that we search these books.
693         """
694         self._books = books
695
696     def tags(self, tags):
697         """
698         Give a hint that these Tag objects (a list)
699         are necessary.
700         """
701         for t in tags:
702             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
703                 lst = self.book_tags.get(t.category, [])
704                 lst.append(t)
705                 self.book_tags[t.category] = lst
706             if t.category in ['theme', 'theme_pl']:
707                 self.part_tags.append(t)
708
709     def tag_filter(self, tags, field='tags'):
710         """
711         Given a list of tags and an optional field (they normally live in the 'tags' field),
712         return a filter accepting only books with the specified tags.
713         """
714         q = BooleanQuery()
715
716         for tag in tags:
717             toks = self.search.get_tokens(tag.name, field=field)
718             tag_phrase = PhraseQuery()
719             for tok in toks:
720                 tag_phrase.add(Term(field, tok))
721             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
722
723         return QueryWrapperFilter(q)
724
725     def book_filter(self):
726         """
727         Filters using book tags (all tag categories except theme).
728         """
729         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
730         if tags:
731             return self.tag_filter(tags)
732         else:
733             return None
734
735     def part_filter(self):
736         """
737         This filter can be used to look for book parts.
738         It filters on book id and/or themes.
739         """
740         fs = []
741         if self.part_tags:
742             fs.append(self.tag_filter(self.part_tags, field='themes'))
743
744         if self._books != []:
745             bf = BooleanFilter()
746             for b in self._books:
747                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
748                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
749             fs.append(bf)
750
751         return Search.chain_filters(fs)
752
753     def should_search_for_book(self):
754         return self._books == []
755
756     def just_search_in(self, all):
757         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
758         some = []
759         for field in all:
760             if field == 'authors' and 'author' in self.book_tags:
761                 continue
762             if field == 'title' and self._books != []:
763                 continue
764             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
765                 continue
766             some.append(field)
767         return some
768
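# Illustrative usage sketch: a Hint narrows the search once we already know
# something about the target; `author_tag` is a hypothetical Tag with category
# 'author':
#
#     search = Search()
#     hint = search.hint()
#     hint.tags([author_tag])
#     books = search.search_perfect_book(u"pan tadeusz", hint=hint)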
769
770 class Search(IndexStore):
771     """
772     Search facilities.
773     """
774     def __init__(self, default_field="content"):
775         IndexStore.__init__(self)
776         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
777         # self.analyzer = WLAnalyzer()
778         self.searcher = IndexSearcher(self.store, True)
779         self.parser = QueryParser(Version.LUCENE_34, default_field,
780                                   self.analyzer)
781
782         self.parent_filter = TermsFilter()
783         self.parent_filter.addTerm(Term("is_book", "true"))
784
785     def query(self, query):
786         """Parse query in default Lucene Syntax. (for humans)
787         """
788         return self.parser.parse(query)
789
790     def simple_search(self, query, max_results=50):
791         """Runs a query for books using lucene syntax. (for humans)
792         Returns (books, total_hits)
793         """
794
795         tops = self.searcher.search(self.query(query), max_results)
796         bks = []
797         for found in tops.scoreDocs:
798             doc = self.searcher.doc(found.doc)
799             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
800         return (bks, tops.totalHits)
801
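    # Illustrative usage sketch: querying with plain Lucene syntax returns
    # matching Book objects and the total hit count:
    #
    #     search = Search()
    #     books, total = search.simple_search(u'authors: Mickiewicz')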
802     def get_tokens(self, searched, field='content'):
803         """returns tokens analyzed by a proper (for a field) analyzer
804         argument can be: StringReader, string/unicode, or tokens. In the last case
805         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
806         """
807         if isinstance(searched, str) or isinstance(searched, unicode):
808             searched = StringReader(searched)
809         elif isinstance(searched, list):
810             return searched
811
812         searched.reset()
813         tokens = self.analyzer.reusableTokenStream(field, searched)
814         toks = []
815         while tokens.incrementToken():
816             cta = tokens.getAttribute(CharTermAttribute.class_)
817             toks.append(cta.toString())
818         return toks
819
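    # Illustrative sketch: get_tokens() picks the per-field analyzer from
    # WLAnalyzer, so the same text tokenizes differently depending on the
    # field; `search` is a hypothetical Search instance:
    #
    #     search.get_tokens(u'Pan Tadeusz', field='SIMPLE')   # lower-cased words
    #     search.get_tokens(u'Pan Tadeusz', field='POLISH')   # stemmed forms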
820     def fuzziness(self, fuzzy):
821         """Helper method to sanitize fuzziness"""
822         if not fuzzy:
823             return None
824         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
825             return fuzzy
826         else:
827             return 0.5
828
829     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
830         """
831         Return a PhraseQuery with a series of tokens.
832         """
833         if fuzzy:
834             phrase = MultiPhraseQuery()
835             for t in tokens:
836                 term = Term(field, t)
837                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
838                 fuzzterms = []
839
840                 while True:
841                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
842                     ft = fuzzterm.term()
843                     if ft:
844                         fuzzterms.append(ft)
845                     if not fuzzterm.next(): break
846                 if fuzzterms:
847                     phrase.add(JArray('object')(fuzzterms, Term))
848                 else:
849                     phrase.add(term)
850         else:
851             phrase = PhraseQuery()
852             phrase.setSlop(slop)
853             for t in tokens:
854                 term = Term(field, t)
855                 phrase.add(term)
856         return phrase
857
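    # Illustrative sketch: building a slop-2 phrase query over the content
    # field from tokens analyzed with the matching analyzer:
    #
    #     toks = search.get_tokens(u'szklane domy', field='content')
    #     q = search.make_phrase(toks, field='content', slop=2)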
858     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
859         """
860         Returns term queries joined by a boolean query.
861         modal - occurrence modality applied to each boolean clause
862         fuzzy - whether the query should be fuzzy.
863         """
864         q = BooleanQuery()
865         for t in tokens:
866             term = Term(field, t)
867             if fuzzy:
868                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
869             else:
870                 term = TermQuery(term)
871             q.add(BooleanClause(term, modal))
872         return q
873
874     # def content_query(self, query):
875     #     return BlockJoinQuery(query, self.parent_filter,
876     #                           BlockJoinQuery.ScoreMode.Total)
877
878     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
879         """
880         Search for perfect book matches. Just see if the query matches some author or title,
881         taking hints into account.
882         """
883         fields_to_search = ['authors', 'title']
884         only_in = None
885         if hint:
886             if not hint.should_search_for_book():
887                 return []
888             fields_to_search = hint.just_search_in(fields_to_search)
889             only_in = hint.book_filter()
890
891         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
892
893         books = []
894         for q in qrys:
895             top = self.searcher.search(q,
896                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
897                 max_results)
898             for found in top.scoreDocs:
899                 books.append(SearchResult(self.searcher, found, how_found="search_perfect_book"))
900         return books
901
902     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
903         fields_to_search = ['tags', 'authors', 'title']
904
905         only_in = None
906         if hint:
907             if not hint.should_search_for_book():
908                 return []
909             fields_to_search = hint.just_search_in(fields_to_search)
910             only_in = hint.book_filter()
911
912         tokens = self.get_tokens(searched, field='SIMPLE')
913
914         q = BooleanQuery()
915
916         for fld in fields_to_search:
917             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
918                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
919
920         books = []
921         top = self.searcher.search(q,
922                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
923             max_results)
924         for found in top.scoreDocs:
925             books.append(SearchResult(self.searcher, found, how_found="search_book"))
926
927         return books
928
929     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
930         """
931         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
932         some part/fragment of the book.
933         """
934         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
935
936         flt = None
937         if hint:
938             flt = hint.part_filter()
939
940         books = []
941         for q in qrys:
942             top = self.searcher.search(q,
943                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
944                                                            flt]),
945                                        max_results)
946             for found in top.scoreDocs:
947                 books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
948
949         return books
950
951     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
952         """
953         Tries to use search terms to match different fields of a book (or its parts).
954         E.g. one word can be an author's name, another a part of the title, and the rest
955         some words from the third chapter.
956         """
957         books = []
958         only_in = None
959
960         if hint:
961             only_in = hint.part_filter()
962
963         # content only query : themes x content
964         q = BooleanQuery()
965
966         tokens_pl = self.get_tokens(searched, field='content')
967         tokens = self.get_tokens(searched, field='SIMPLE')
968
969         # only search in themes when we do not already filter by themes
970         if hint is None or hint.just_search_in(['themes']) != []:
971             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
972                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
973
974         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
975                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
976
977         topDocs = self.searcher.search(q, only_in, max_results)
978         for found in topDocs.scoreDocs:
979             books.append(SearchResult(self.searcher, found, how_found='search_everywhere_themesXcontent'))
980             print "* %s theme x content: %s" % (searched, books[-1]._hits)
981
982         # query themes/content x author/title/tags
983         q = BooleanQuery()
984         in_content = BooleanQuery()
985         in_meta = BooleanQuery()
986
987         for fld in ['themes_pl', 'content']:
988             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
989
990         for fld in ['tags', 'authors', 'title']:
991             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
992
993         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
994         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
995
996         topDocs = self.searcher.search(q, only_in, max_results)
997         for found in topDocs.scoreDocs:
998             books.append(SearchResult(self.searcher, found, how_found='search_everywhere'))
999             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1000
1001         return books
1002
1003     # def multisearch(self, query, max_results=50):
1004     #     """
1005     #     Search strategy:
1006     #     - (phrase) OR -> content
1007     #                   -> title
1008     #                   -> authors
1009     #     - (keywords)  -> authors
1010     #                   -> motyw
1011     #                   -> tags
1012     #                   -> content
1013     #     """
1014         # queryreader = StringReader(query)
1015         # tokens = self.get_tokens(queryreader)
1016
1017         # top_level = BooleanQuery()
1018         # Should = BooleanClause.Occur.SHOULD
1019
1020         # phrase_level = BooleanQuery()
1021         # phrase_level.setBoost(1.3)
1022
1023         # p_content = self.make_phrase(tokens, joined=True)
1024         # p_title = self.make_phrase(tokens, 'title')
1025         # p_author = self.make_phrase(tokens, 'author')
1026
1027         # phrase_level.add(BooleanClause(p_content, Should))
1028         # phrase_level.add(BooleanClause(p_title, Should))
1029         # phrase_level.add(BooleanClause(p_author, Should))
1030
1031         # kw_level = BooleanQuery()
1032
1033         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1034         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1035         # kw_level.add(j_themes, Should)
1036         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1037         # j_con = self.make_term_query(tokens, joined=True)
1038         # kw_level.add(j_con, Should)
1039
1040         # top_level.add(BooleanClause(phrase_level, Should))
1041         # top_level.add(BooleanClause(kw_level, Should))
1042
1043         # return None
1044
1045     def get_snippets(self, scoreDoc, query, field='content'):
1046         """
1047         Returns a snippet for found scoreDoc.
1048         """
1049         htmlFormatter = SimpleHTMLFormatter()
1050         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1051
1052         stored = self.searcher.doc(scoreDoc.doc)
1053
1054         # locate content.
1055         snippets = Snippets(stored.get('book_id')).open()
1056         try:
1057             text = snippets.get((int(stored.get('snippets_position')),
1058                                  int(stored.get('snippets_length'))))
1059         finally:
1060             snippets.close()
1061
1062         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1063         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1064         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1065
1066         return snip
1067
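    # Illustrative sketch: highlighting hits, as done in search_perfect_parts()
    # above; `top` and `q` come from a previous searcher.search() call:
    #
    #     for found in top.scoreDocs:
    #         snippet = search.get_snippets(found, q)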
1068     @staticmethod
1069     def enum_to_array(enum):
1070         """
1071         Converts a lucene TermEnum to an array of Terms, suitable for
1072         adding to queries.
1073         """
1074         terms = []
1075
1076         while True:
1077             t = enum.term()
1078             if t:
1079                 terms.append(t)
1080             if not enum.next(): break
1081
1082         if terms:
1083             return JArray('object')(terms, Term)
1084
1085     def search_tags(self, query, filter=None, max_results=40):
1086         """
1087         Search for Tag objects using query.
1088         """
1089         tops = self.searcher.search(query, filter, max_results)
1090
1091         tags = []
1092         for found in tops.scoreDocs:
1093             doc = self.searcher.doc(found.doc)
1094             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1095             tags.append(tag)
1096             print "%s (%d) -> %f" % (tag, tag.id, found.score)
1097
1098         return tags
1099
1100     def search_books(self, query, filter=None, max_results=10):
1101         """
1102         Searches for Book objects using query
1103         """
1104         bks = []
1105         tops = self.searcher.search(query, filter, max_results)
1106         for found in tops.scoreDocs:
1107             doc = self.searcher.doc(found.doc)
1108             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1109         return bks
1110
1111     def create_prefix_phrase(self, toks, field):
1112         q = MultiPhraseQuery()
1113         for i in range(len(toks)):
1114             t = Term(field, toks[i])
1115             if i == len(toks) - 1:
1116                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1117                 if pterms:
1118                     q.add(pterms)
1119                 else:
1120                     q.add(t)
1121             else:
1122                 q.add(t)
1123         return q
1124
1125     @staticmethod
1126     def term_filter(term, inverse=False):
1127         only_term = TermsFilter()
1128         only_term.addTerm(term)
1129
1130         if inverse:
1131             neg = BooleanFilter()
1132             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1133             only_term = neg
1134
1135         return only_term
1136
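    # Illustrative sketch: term_filter() restricts hits to book-level
    # documents, or excludes them when inverse=True:
    #
    #     only_books = Search.term_filter(Term('is_book', 'true'))
    #     only_parts = Search.term_filter(Term('is_book', 'true'), inverse=True)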
1137     def hint_tags(self, string, max_results=50):
1138         """
1139         Return auto-complete hints for tags
1140         using prefix search.
1141         """
1142         toks = self.get_tokens(string, field='SIMPLE')
1143         top = BooleanQuery()
1144
1145         for field in ['tag_name', 'tag_name_pl']:
1146             q = self.create_prefix_phrase(toks, field)
1147             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1148
1149         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1150
1151         return self.search_tags(top, no_book_cat, max_results=max_results)
1152
1153     def hint_books(self, string, max_results=50):
1154         """
1155         Returns auto-complete hints for book titles,
1156         because we do not index 'pseudo' title tags.
1157         Uses prefix search.
1158         """
1159         toks = self.get_tokens(string, field='SIMPLE')
1160
1161         q = self.create_prefix_phrase(toks, 'title')
1162
1163         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1164
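    # Illustrative sketch: prefix-based auto-complete for tags and titles;
    # `search` is a hypothetical Search instance:
    #
    #     search.hint_tags(u'roman')    # matching Tag objects
    #     search.hint_books(u'pan t')   # matching Book objects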
1165     @staticmethod
1166     def chain_filters(filters, op=ChainedFilter.AND):
1167         """
1168         Chains a filter list together
1169         """
1170         filters = filter(lambda x: x is not None, filters)
1171         if not filters:
1172             return None
1173         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1174         return chf
1175
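    # Illustrative sketch: None entries are dropped, so optional filters can be
    # chained unconditionally, as in search_perfect_book() above; `only_in` may
    # be a filter or None:
    #
    #     flt = Search.chain_filters([only_in, Search.term_filter(Term('is_book', 'true'))])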
1176     def filtered_categories(self, tags):
1177         """
1178         Return a list of tag categories present in the tags list.
1179         """
1180         cats = {}
1181         for t in tags:
1182             cats[t.category] = True
1183         return cats.keys()
1184
1185     def hint(self):
1186         return Hint(self)