Remove book from index on deletion.
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
33 import atexit
34 import traceback
35
36
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
38     def __init__(self):
39         polish = PolishAnalyzer(Version.LUCENE_34)
40         #        polish_gap.setPositionIncrementGap(999)
41
42         simple = SimpleAnalyzer(Version.LUCENE_34)
43         #        simple_gap.setPositionIncrementGap(999)
44
45         keyword = KeywordAnalyzer(Version.LUCENE_34)
46
47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
48
49         PerFieldAnalyzerWrapper.__init__(self, polish)
50
51         self.addAnalyzer("tags", simple)
52         self.addAnalyzer("technical_editors", simple)
53         self.addAnalyzer("editors", simple)
54         self.addAnalyzer("url", keyword)
55         self.addAnalyzer("source_url", keyword)
56         self.addAnalyzer("source_name", simple)
57         self.addAnalyzer("publisher", simple)
58         self.addAnalyzer("authors", simple)
59         self.addAnalyzer("title", simple)
60
61         self.addAnalyzer("is_book", keyword)
62         # shouldn't the title have two forms? _pl and simple?
63
64         self.addAnalyzer("themes", simple)
65         self.addAnalyzer("themes_pl", polish)
66
67         self.addAnalyzer("tag_name", simple)
68         self.addAnalyzer("tag_name_pl", polish)
69
70         self.addAnalyzer("translators", simple)
71
72         self.addAnalyzer("KEYWORD", keyword)
73         self.addAnalyzer("SIMPLE", simple)
74         self.addAnalyzer("POLISH", polish)
75
76
77 class IndexStore(object):
78     """
79     Provides access to search index.
80
81     self.store - lucene index directory
82     """
83     def __init__(self):
84         self.make_index_dir()
85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
86
87     def make_index_dir(self):
88         try:
89             os.makedirs(settings.SEARCH_INDEX)
90         except OSError as exc:
91             if exc.errno == errno.EEXIST:
92                 pass
93             else: raise
94
95
96 class IndexChecker(IndexStore):
97     def __init__(self):
98         IndexStore.__init__(self)
99
100     def check(self):
101         checker = CheckIndex(self.store)
102         status = checker.checkIndex()
103         return status
104
105
106 class Snippets(object):
107     """
108     This class manages snippet files for an indexed object (book).
109     The snippets are concatenated together, and their positions and
110     lengths are kept in lucene index fields.
111     """
112     SNIPPET_DIR = "snippets"
113
114     def __init__(self, book_id):
115         try:
116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117         except OSError as exc:
118             if exc.errno == errno.EEXIST:
119                 pass
120             else: raise
121         self.book_id = book_id
122         self.file = None
123
124     def open(self, mode='r'):
125         """
126         Open the snippet file. Call .close() afterwards.
127         """
128         if 'b' not in mode:
129             mode += 'b'
130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
131         self.position = 0
132         return self
133
134     def add(self, snippet):
135         """
136         Append a snippet (unicode) to the snippet file.
137         Return a (position, length) tuple
138         """
139         txt = snippet.encode('utf-8')
140         l = len(txt)
141         self.file.write(txt)
142         pos = (self.position, l)
143         self.position += l
144         return pos
145
146     def get(self, pos):
147         """
148         Given a (position, length) tuple, return the snippet
149         stored there as a unicode string.
150         """
151         self.file.seek(pos[0], 0)
152         txt = self.file.read(pos[1]).decode('utf-8')
153         return txt
154
155     def close(self):
156         """Close snippet file"""
157         self.file.close()
158
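# A minimal usage sketch for Snippets (illustrative only; assumes an existing
# book id and a configured settings.SEARCH_INDEX):
#
#   snippets = Snippets(book_id).open('w')
#   try:
#       pos = snippets.add(u"Fragment text...")  # returns a (position, length) tuple
#   finally:
#       snippets.close()
#
#   # later, e.g. when building search result snippets:
#   text = Snippets(book_id).open().get(pos)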
159
160 class BaseIndex(IndexStore):
161     """
162     Base index class.
163     Provides basic operations on index: opening, closing, optimizing.
164     """
165     def __init__(self, analyzer=None):
166         super(BaseIndex, self).__init__()
167         self.index = None
168         if not analyzer:
169             analyzer = WLAnalyzer()
170         self.analyzer = analyzer
171
172     def open(self, analyzer=None, timeout=None):
173         if self.index:
174             raise Exception("Index is already opened")
175         conf = IndexWriterConfig(Version.LUCENE_34, analyzer or self.analyzer)
176         if timeout:
177             conf.setWriteLockTimeout(long(timeout))
178         self.index = IndexWriter(self.store, conf)
179         return self.index
180
181     def optimize(self):
182         self.index.optimize()
183
184     def close(self):
185         try:
186             self.index.optimize()
187         except JavaError as je:
188             print "Error during optimize phase, check index: %s" % je
189
190         self.index.close()
191         self.index = None
192
193     def __enter__(self):
194         self.open()
195         return self
196
197     def __exit__(self, type, value, tb):
198         self.close()
199
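# BaseIndex subclasses can be used as context managers; a sketch, assuming a
# catalogue.models.Book instance `book`:
#
#   with Index() as index:
#       index.index_book(book)
#
# __exit__() calls close(), which optimizes and closes the underlying IndexWriter.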
200
201 class Index(BaseIndex):
202     """
203     Class indexing books.
204     """
205     def __init__(self, analyzer=None):
206         super(Index, self).__init__(analyzer)
207
208     def index_tags(self):
209         """
210         Re-index the global tag list.
211         Removes all tags from the index, then indexes them again.
212         Indexed fields include: id, name (with and without polish stems), category
213         """
214         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
215         self.index.deleteDocuments(q)
216
217         for tag in catalogue.models.Tag.objects.all():
218             doc = Document()
219             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
220             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
221             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
222             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
223             self.index.addDocument(doc)
224
225         for pdtag in PDCounterAuthor.objects.all():
226             doc = Document()
227             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
228             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
229             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
230             doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
231             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
232             self.index.addDocument(doc)
233
234     def create_book_doc(self, book):
235         """
236         Create a lucene document referring to the book id.
237         """
238         doc = Document()
239         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
240         if book.parent is not None:
241             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
242         return doc
243
244     def remove_book(self, book):
245         """Removes a book from search index.
246         book - Book instance."""
247         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
248         self.index.deleteDocuments(q)
249
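    # Sketch of wiring remove_book() into book deletion (illustrative only; the
    # handler name is hypothetical and the actual signal hookup lives outside
    # this module):
    #
    #   from django.db.models.signals import post_delete
    #
    #   def unindex_book(sender, instance, **kwargs):
    #       with Index() as index:
    #           index.remove_book(instance)
    #
    #   post_delete.connect(unindex_book, sender=catalogue.models.Book)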
250     def index_book(self, book, book_info=None, overwrite=True):
251         """
252         Indexes the book.
253         Creates a lucene document for extracted metadata
254         and calls self.index_content() to index the contents of the book.
255         """
256         if overwrite:
257             self.remove_book(book)
258
259         book_doc = self.create_book_doc(book)
260         meta_fields = self.extract_metadata(book, book_info)
261         for f in meta_fields.values():
262             if isinstance(f, list) or isinstance(f, tuple):
263                 for elem in f:
264                     book_doc.add(elem)
265             else:
266                 book_doc.add(f)
267
268         self.index.addDocument(book_doc)
269         del book_doc
270
271         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
272
273     master_tags = [
274         'opowiadanie',
275         'powiesc',
276         'dramat_wierszowany_l',
277         'dramat_wierszowany_lp',
278         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
279         'wywiad',
280         ]
281
282     ignore_content_tags = [
283         'uwaga', 'extra',
284         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
285         'didaskalia',
286         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
287         ]
288
289     footnote_tags = ['pa', 'pt', 'pr', 'pe']
290
291     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
292
293     published_date_re = re.compile("([0-9]+)[\]. ]*$")
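    # published_date_re picks the trailing year out of a source description,
    # e.g. (illustrative value):
    #
    #   Index.published_date_re.search(u"Czytelnik, Warszawa 1985.").groups()[0]  # -> u'1985'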
294
295     def extract_metadata(self, book, book_info=None):
296         """
297         Extracts metadata from the book and returns a map of fields keyed by field name.
298         """
299         fields = {}
300
301         if book_info is None:
302             book_info = dcparser.parse(open(book.xml_file.path))
303
304         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
305         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
306         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
307
308         # validator, name
309         for field in dcparser.BookInfo.FIELDS:
310             if hasattr(book_info, field.name):
311                 if not getattr(book_info, field.name):
312                     continue
313                 # since no type information is available, we use validator
314                 type_indicator = field.validator
315                 if type_indicator == dcparser.as_unicode:
316                     s = getattr(book_info, field.name)
317                     if field.multiple:
318                         s = ', '.join(s)
319                     try:
320                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
321                     except JavaError as je:
322                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
323                 elif type_indicator == dcparser.as_person:
324                     p = getattr(book_info, field.name)
325                     if isinstance(p, dcparser.Person):
326                         persons = unicode(p)
327                     else:
328                         persons = ', '.join(map(unicode, p))
329                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
330                 elif type_indicator == dcparser.as_date:
331                     dt = getattr(book_info, field.name)
332                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
333                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
334
335         # get published date
336         pd = None
337         if hasattr(book_info, 'source_name') and book_info.source_name:
338             match = self.published_date_re.search(book_info.source_name)
339             if match is not None:
340                 pd = str(match.groups()[0])
341         if not pd: pd = ""
342         fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
343
344         return fields
345
346     def add_gaps(self, fields, fieldname):
347         """
348         Interleaves a list of fields with gap-fields (indexed single spaces) and returns the result.
349         This allows phrase queries which do not cross the gaps (when slop is 0).
350         """
351         def gap():
352             while True:
353                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
354         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
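    # Sketch of the effect (hypothetical tag values):
    #
    #   add_gaps([Field("tags", u"Jan Kowalski", ...), Field("tags", u"Lalka", ...)], 'tags')
    #   # -> [tags:"Jan Kowalski", tags:" ", tags:"Lalka"]
    #
    # The single-space gap field keeps a slop=0 phrase query from matching
    # across the boundary between two tag values.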
355
356     def get_master(self, root):
357         """
358         Returns the first master tag from an etree.
359         """
360         for master in root.iter():
361             if master.tag in self.master_tags:
362                 return master
363
364     def index_content(self, book, book_fields=[]):
365         """
366         Walks the book XML and extracts content from it.
367         Adds parts for each header tag and for each fragment.
368         """
369         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
370         root = wld.edoc.getroot()
371
372         master = self.get_master(root)
373         if master is None:
374             return []
375
376         def walker(node, ignore_tags=[]):
377
378             if node.tag not in ignore_tags:
379                 yield node, None, None
380                 if node.text is not None:
381                     yield None, node.text, None
382                 for child in list(node):
383                     for b, t, e in walker(child):
384                         yield b, t, e
385                 yield None, None, node
386
387             if node.tail is not None:
388                 yield None, node.tail, None
389             return
390
391         def fix_format(text):
392             #            separator = [u" ", u"\t", u".", u";", u","]
393             if isinstance(text, list):
394                 # need to join it first
395                 text = filter(lambda s: s is not None, text)
396                 text = u' '.join(text)
397                 # for i in range(len(text)):
398                 #     if i > 0:
399                 #         if text[i][0] not in separator\
400                 #             and text[i - 1][-1] not in separator:
401                 #          text.insert(i, u" ")
402
403             return re.sub("(?m)/$", "", text)
404
405         def add_part(snippets, **fields):
406             doc = self.create_book_doc(book)
407             for f in book_fields:
408                 doc.add(f)
409
410             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
411             doc.add(NumericField("header_span", Field.Store.YES, True)\
412                     .setIntValue(fields.get('header_span', 1)))
413             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
414
415             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
416                           Field.TermVector.WITH_POSITIONS_OFFSETS))
417
418             snip_pos = snippets.add(fields["content"])
419             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
420             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
421
422             if 'fragment_anchor' in fields:
423                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
424                               Field.Store.YES, Field.Index.NOT_ANALYZED))
425
426             if 'themes' in fields:
427                 themes, themes_pl = zip(*[
428                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
429                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
430                      for theme in fields['themes']])
431
432                 themes = self.add_gaps(themes, 'themes')
433                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
434
435                 for t in themes:
436                     doc.add(t)
437                 for t in themes_pl:
438                     doc.add(t)
439
440             return doc
441
442         def give_me_utf8(s):
443             if isinstance(s, unicode):
444                 return s.encode('utf-8')
445             else:
446                 return s
447
448         fragments = {}
449         snippets = Snippets(book.id).open('w')
450         try:
451             for position, header in enumerate(master):
452
453                 if header.tag in self.skip_header_tags:
454                     continue
455                 if header.tag is etree.Comment:
456                     continue
457
458                 # section content
459                 content = []
460                 footnote = []
461
462                 def all_content(text):
463                     for frag in fragments.values():
464                         frag['content'].append(text)
465                     content.append(text)
466                 handle_text = [all_content]
467
468
469                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
470                     # handle footnotes
471                     if start is not None and start.tag in self.footnote_tags:
472                         footnote = []
473                         def collect_footnote(t):
474                             footnote.append(t)
475                         handle_text.append(collect_footnote)
476                     elif end is not None and end.tag in self.footnote_tags:
477                         handle_text.pop()
478                         doc = add_part(snippets, header_index=position, header_type=header.tag,
479                                        content=u''.join(footnote),
480                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
481                 
482                         self.index.addDocument(doc)
483                         #print "@ footnote text: %s" % footnote
484                         footnote = []
485                     
486                     # handle fragments and themes.
487                     if start is not None and start.tag == 'begin':
488                         fid = start.attrib['id'][1:]
489                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
490
491                     # themes for this fragment
492                     elif start is not None and start.tag == 'motyw':
493                         fid = start.attrib['id'][1:]
494                         handle_text.append(None)
495                         if start.text is not None:
496                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
497                     elif end is not None and end.tag == 'motyw':
498                         handle_text.pop()
499
500                     elif start is not None and start.tag == 'end':
501                         fid = start.attrib['id'][1:]
502                         if fid not in fragments:
503                             continue  # a broken <end> node, skip it
504                         frag = fragments[fid]
505                         if frag['themes'] == []:
506                             continue  # empty themes list.
507                         del fragments[fid]
508
509                         doc = add_part(snippets,
510                                        header_type=frag['start_header'],
511                                        header_index=frag['start_section'],
512                                        header_span=position - frag['start_section'] + 1,
513                                        fragment_anchor=fid,
514                                        content=fix_format(frag['content']),
515                                        themes=frag['themes'])
516                         #print '@ FRAG %s' % frag['content']
517                         self.index.addDocument(doc)
518
519                         # Collect content.
520
521                     if text is not None and handle_text:
522                         hdl = handle_text[-1]
523                         if hdl is not None:
524                             hdl(text)
525
526                 # in the end, add a section text.
527                 doc = add_part(snippets, header_index=position, header_type=header.tag,
528                                content=fix_format(content))
529                 #print '@ CONTENT: %s' % fix_format(content)
530
531                 self.index.addDocument(doc)
532
533         finally:
534             snippets.close()
535
536
537 def log_exception_wrapper(f):
538     def _wrap(*a):
539         try:
540             f(*a)
541         except Exception as e:
542             print("Error in indexing thread: %s" % e)
543             traceback.print_exc()
544             raise
545     return _wrap
546
547
548 class ReusableIndex(Index):
549     """
550     Works like Index, but does not close/optimize the Lucene index
551     until program exit (uses an atexit hook).
552     This is useful for the importbooks command.
553
554     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
555     """
556     index = None
557
558     def open(self, analyzer=None, **kw):
559         if ReusableIndex.index:
560             self.index = ReusableIndex.index
561         else:
562             print("opening index")
563             Index.open(self, analyzer, **kw)
564             ReusableIndex.index = self.index
565             atexit.register(ReusableIndex.close_reusable)
566
567     # def index_book(self, *args, **kw):
568     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
569     #     ReusableIndex.pool_jobs.append(job)
570
571     @staticmethod
572     def close_reusable():
573         if ReusableIndex.index:
574             print("closing index")
575             ReusableIndex.index.optimize()
576             ReusableIndex.index.close()
577             ReusableIndex.index = None
578
579     def close(self):
580         if ReusableIndex.index:
581             ReusableIndex.index.commit()
582
583
584 class JoinSearch(object):
585     """
586     This mixin could be used to handle block join queries.
587     (currently unused)
588     """
589     def __init__(self, *args, **kw):
590         super(JoinSearch, self).__init__(*args, **kw)
591
592     def wrapjoins(self, query, fields=[]):
593         """
594         This function modifies the query recursively, so that
595         contained Term and Phrase queries which match the
596         provided fields are wrapped in a BlockJoinQuery
597         and thus delegated to child documents.
598         """
599         if BooleanQuery.instance_(query):
600             qs = BooleanQuery.cast_(query)
601             for clause in qs:
602                 clause = BooleanClause.cast_(clause)
603                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
604             return qs
605         else:
606             termset = HashSet()
607             query.extractTerms(termset)
608             for t in termset:
609                 t = Term.cast_(t)
610                 if t.field() not in fields:
611                     return query
612             return BlockJoinQuery(query, self.parent_filter,
613                                   BlockJoinQuery.ScoreMode.Total)
614
615     def bsearch(self, query, max_results=50):
616         q = self.query(query)
617         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
618
619         tops = self.searcher.search(bjq, max_results)
620         bks = []
621         for found in tops.scoreDocs:
622             doc = self.searcher.doc(found.doc)
623             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
624         return (bks, tops.totalHits)
625
626
627 class SearchResult(object):
628     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
629         if tokens_cache is None: tokens_cache = {}
630
631         if score:
632             self._score = score
633         else:
634             self._score = scoreDocs.score
635
636         self.boost = 1.0
637
638         self._hits = []
639         self._processed_hits = None  # processed hits
640
641         stored = search.searcher.doc(scoreDocs.doc)
642         self.book_id = int(stored.get("book_id"))
643
644         pd = stored.get("published_date")
645         if pd is None:
646             pd = 0
647         self.published_date = int(pd)
648
649         header_type = stored.get("header_type")
650         # we have a content hit in some header or fragment
651         if header_type is not None:
652             sec = (header_type, int(stored.get("header_index")))
653             header_span = stored.get('header_span')
654             header_span = int(header_span) if header_span is not None else 1
655
656             fragment = stored.get("fragment_anchor")
657
658             if snippets:
659                 snippets = snippets.replace("/\n", "\n")
660             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
661
662             self._hits.append(hit)
663
664         self.search = search
665         self.searched = searched
666         self.tokens_cache = tokens_cache
667
668     @property
669     def score(self):
670         return self._score * self.boost
671
672     def merge(self, other):
673         if self.book_id != other.book_id:
674             raise ValueError("this search result is for book %d; tried to merge with a result for book %d" % (self.book_id, other.book_id))
675         self._hits += other._hits
676         if other.score > self.score:
677             self._score = other._score
678         return self
679
680     def get_book(self):
681         return catalogue.models.Book.objects.get(id=self.book_id)
682
683     book = property(get_book)
684
685     @property
686     def hits(self):
687         if self._processed_hits is not None:
688             return self._processed_hits
689
690         POSITION = 0
691         FRAGMENT = 1
692         POSITION_INDEX = 1
693         POSITION_SPAN = 2
694         SCORE = 2
695         OTHER = 3
696
697         # to sections and fragments
698         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
699         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
700         sect = filter(lambda s: 0 == len(filter(
701             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
702             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
703             frags)), sect)
704
705         hits = []
706
707         # remove duplicate fragments
708         fragments = {}
709         for f in frags:
710             fid = f[FRAGMENT]
711             if fid in fragments:
712                 if fragments[fid][SCORE] >= f[SCORE]:
713                     continue
714             fragments[fid] = f
715         frags = fragments.values()
716
717         # remove duplicate sections
718         sections = {}
719
720         for s in sect:
721             si = s[POSITION][POSITION_INDEX]
722             # skip existing
723             if si in sections:
724                 if sections[si]['score'] >= s[SCORE]:
725                     continue
726
727             m = {'score': s[SCORE],
728                  'section_number': s[POSITION][POSITION_INDEX] + 1,
729                  }
730             m.update(s[OTHER])
731             sections[si] = m
732
733         hits = sections.values()
734
735         for f in frags:
736             try:
737                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
738             except catalogue.models.Fragment.DoesNotExist:
739                 # stale index
740                 continue
741
742             # Figure out if we were searching for a token matching some word in theme name.
743             themes = frag.tags.filter(category='theme')
744             themes_hit = []
745             if self.searched is not None:
746                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
747                 for theme in themes:
748                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
749                     for t in tokens:
750                         if t in name_tokens:
751                             if not theme in themes_hit:
752                                 themes_hit.append(theme)
753                             break
754
755             m = {'score': f[SCORE],
756                  'fragment': frag,
757                  'section_number': f[POSITION][POSITION_INDEX] + 1,
758                  'themes': themes,
759                  'themes_hit': themes_hit
760                  }
761             m.update(f[OTHER])
762             hits.append(m)
763
764         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
765
766         self._processed_hits = hits
767
768         return hits
769
770     def __unicode__(self):
771         return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)
772
773     @staticmethod
774     def aggregate(*result_lists):
775         books = {}
776         for rl in result_lists:
777             for r in rl:
778                 if r.book_id in books:
779                     books[r.book_id].merge(r)
780                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
781                 else:
782                     books[r.book_id] = r
783         return books.values()
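    # Aggregation merges hits that belong to the same book across several result
    # lists; a sketch, assuming a Search instance `search` and a query string:
    #
    #   results = SearchResult.aggregate(
    #       search.search_perfect_book(query),
    #       search.search_everywhere(query))
    #   results.sort(reverse=True)  # __cmp__: higher score first, then earlier published_date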
784
785     def __cmp__(self, other):
786         c = cmp(self.score, other.score)
787         if c == 0:
788             # this is inverted, because earlier date is better
789             return cmp(other.published_date, self.published_date)
790         else:
791             return c
792
793
794 class Hint(object):
795     """
796     Given some hint information (things we already know about
797     our search target, like author, title of a specific book, epoch, genre, kind),
798     we can narrow down the search using filters.
799     """
800     def __init__(self, search):
801         """
802         Accepts a Searcher instance.
803         """
804         self.search = search
805         self.book_tags = {}
806         self.part_tags = []
807         self._books = []
808
809     def books(self, *books):
810         """
811         Give a hint that we search these books.
812         """
813         self._books = books
814
815     def tags(self, tags):
816         """
817         Give a hint that these Tag objects (a list)
818         are required.
819         """
820         for t in tags:
821             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
822                 lst = self.book_tags.get(t.category, [])
823                 lst.append(t)
824                 self.book_tags[t.category] = lst
825             if t.category in ['theme', 'theme_pl']:
826                 self.part_tags.append(t)
827
828     def tag_filter(self, tags, field='tags'):
829         """
830         Given a list of tags and an optional field (they are normally in the 'tags' field),
831         returns a filter accepting only books with those specific tags.
832         """
833         q = BooleanQuery()
834
835         for tag in tags:
836             toks = self.search.get_tokens(tag.name, field=field)
837             tag_phrase = PhraseQuery()
838             for tok in toks:
839                 tag_phrase.add(Term(field, tok))
840             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
841
842         return QueryWrapperFilter(q)
843
844     def book_filter(self):
845         """
846         Filters using book tags (all tag categories except theme).
847         """
848         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
849         if tags:
850             return self.tag_filter(tags)
851         else:
852             return None
853
854     def part_filter(self):
855         """
856         This filter can be used to look for book parts.
857         It filters on book id and/or themes.
858         """
859         fs = []
860         if self.part_tags:
861             fs.append(self.tag_filter(self.part_tags, field='themes'))
862
863         if self._books != []:
864             bf = BooleanFilter()
865             for b in self._books:
866                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
867                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
868             fs.append(bf)
869
870         return Search.chain_filters(fs)
871
872     def should_search_for_book(self):
873         return self._books == []
874
875     def just_search_in(self, all):
876         """Holds the logic to figure out which indexes should be searched when we already have some hints."""
877         some = []
878         for field in all:
879             if field == 'authors' and 'author' in self.book_tags:
880                 continue
881             if field == 'title' and self._books != []:
882                 continue
883             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
884                 continue
885             some.append(field)
886         return some
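    # Typical Hint usage; a sketch, assuming a Search instance `search` and a
    # Tag object `author_tag`:
    #
    #   hint = search.hint()
    #   hint.tags([author_tag])
    #   search.search_perfect_book(u"lalka", hint=hint)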
887
888
889 class Search(IndexStore):
890     """
891     Search facilities.
892     """
893     def __init__(self, default_field="content"):
894         IndexStore.__init__(self)
895         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
896         # self.analyzer = WLAnalyzer()
897         self.searcher = IndexSearcher(self.store, True)
898         self.parser = QueryParser(Version.LUCENE_34, default_field,
899                                   self.analyzer)
900
901         self.parent_filter = TermsFilter()
902         self.parent_filter.addTerm(Term("is_book", "true"))
903
904     def query(self, query):
905         """Parse query in default Lucene Syntax. (for humans)
906         """
907         return self.parser.parse(query)
908
909     def simple_search(self, query, max_results=50):
910         """Runs a query for books using lucene syntax. (for humans)
911         Returns (books, total_hits)
912         """
913
914         tops = self.searcher.search(self.query(query), max_results)
915         bks = []
916         for found in tops.scoreDocs:
917             doc = self.searcher.doc(found.doc)
918             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
919         return (bks, tops.totalHits)
920
921     def get_tokens(self, searched, field='content', cached=None):
922         """Returns tokens analyzed by the proper analyzer for the given field.
923         The argument can be a StringReader, a string/unicode, or a list of tokens. In the last case
924         the tokens are just returned (so we can reuse them if we don't change the analyzer).
925         """
926         if cached is not None and field in cached:
927             return cached[field]
928
929         if isinstance(searched, str) or isinstance(searched, unicode):
930             searched = StringReader(searched)
931         elif isinstance(searched, list):
932             return searched
933
934         searched.reset()
935         tokens = self.analyzer.reusableTokenStream(field, searched)
936         toks = []
937         while tokens.incrementToken():
938             cta = tokens.getAttribute(CharTermAttribute.class_)
939             toks.append(cta.toString())
940
941         if cached is not None:
942             cached[field] = toks
943
944         return toks
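    # Example (a sketch; the exact tokens depend on the analyzers configured in
    # WLAnalyzer):
    #
    #   search = Search()
    #   search.get_tokens(u"Pan Tadeusz", field='SIMPLE')  # -> roughly [u'pan', u'tadeusz']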
945
946     def fuzziness(self, fuzzy):
947         """Helper method to sanitize fuzziness"""
948         if not fuzzy:
949             return None
950         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
951             return fuzzy
952         else:
953             return 0.5
954
955     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
956         """
957         Return a PhraseQuery with a series of tokens.
958         """
959         if fuzzy:
960             phrase = MultiPhraseQuery()
961             for t in tokens:
962                 term = Term(field, t)
963                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
964                 fuzzterms = []
965
966                 while True:
967                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
968                     ft = fuzzterm.term()
969                     if ft:
970                         fuzzterms.append(ft)
971                     if not fuzzterm.next(): break
972                 if fuzzterms:
973                     phrase.add(JArray('object')(fuzzterms, Term))
974                 else:
975                     phrase.add(term)
976         else:
977             phrase = PhraseQuery()
978             phrase.setSlop(slop)
979             for t in tokens:
980                 term = Term(field, t)
981                 phrase.add(term)
982         return phrase
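    # A phrase query is typically built from get_tokens() output; a sketch,
    # assuming a Search instance `search`:
    #
    #   tokens = search.get_tokens(u"lata dwudzieste", field='content')
    #   query = search.make_phrase(tokens, field='content', slop=2)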
983
984     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
985         """
986         Returns term queries joined by a boolean query.
987         modal - occurrence mode applied to each clause of the boolean query
988         fuzzy - whether the query should be fuzzy.
989         """
990         q = BooleanQuery()
991         for t in tokens:
992             term = Term(field, t)
993             if fuzzy:
994                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
995             else:
996                 term = TermQuery(term)
997             q.add(BooleanClause(term, modal))
998         return q
999
1000     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1001                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1002         if filters is None: filters = []
1003         if tokens_cache is None: tokens_cache = {}
1004
1005         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1006
1007         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1008         if book:
1009             filters.append(self.term_filter(Term('is_book', 'true')))
1010         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1011
1012         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1013
1014     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1015                     filters=None, tokens_cache=None, boost=None, snippets=True):
1016         if filters is None: filters = []
1017         if tokens_cache is None: tokens_cache = {}
1018
1019         if book:
1020             filters.append(self.term_filter(Term('is_book', 'true')))
1021
1022         query = BooleanQuery()
1023
1024         for fld in fields:
1025             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1026
1027             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1028                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1029
1030         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1031
1032         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1033                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1034
1035     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1036         """
1037         Search for perfect book matches. Just see if the query matches with some author or title,
1038         taking hints into account.
1039         """
1040         fields_to_search = ['authors', 'title']
1041         only_in = None
1042         if hint:
1043             if not hint.should_search_for_book():
1044                 return []
1045             fields_to_search = hint.just_search_in(fields_to_search)
1046             only_in = hint.book_filter()
1047
1048         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1049
1050         books = []
1051         for q in qrys:
1052             top = self.searcher.search(q,
1053                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1054                 max_results)
1055             for found in top.scoreDocs:
1056                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1057         return books
1058
1059     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1060         fields_to_search = ['tags', 'authors', 'title']
1061
1062         only_in = None
1063         if hint:
1064             if not hint.should_search_for_book():
1065                 return []
1066             fields_to_search = hint.just_search_in(fields_to_search)
1067             only_in = hint.book_filter()
1068
1069         tokens = self.get_tokens(searched, field='SIMPLE')
1070
1071         q = BooleanQuery()
1072
1073         for fld in fields_to_search:
1074             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1075                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1076
1077         books = []
1078         top = self.searcher.search(q,
1079                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1080             max_results)
1081         for found in top.scoreDocs:
1082             books.append(SearchResult(self, found, how_found="search_book"))
1083
1084         return books
1085
1086     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1087         """
1088         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1089         some part/fragment of the book.
1090         """
1091         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1092
1093         flt = None
1094         if hint:
1095             flt = hint.part_filter()
1096
1097         books = []
1098         for q in qrys:
1099             top = self.searcher.search(q,
1100                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1101                                                            flt]),
1102                                        max_results)
1103             for found in top.scoreDocs:
1104                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1105
1106         return books
1107
1108     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1109         """
1110         Tries to use search terms to match different fields of book (or its parts).
1111         E.g. one word can be an author's surname, another a part of the title, and the rest
1112         some words from the third chapter.
1113         """
1114         if tokens_cache is None: tokens_cache = {}
1115         books = []
1116         only_in = None
1117
1118         if hint:
1119             only_in = hint.part_filter()
1120
1121         # content only query : themes x content
1122         q = BooleanQuery()
1123
1124         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1125         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1126
1127         # only search in themes when we do not already filter by themes
1128         if hint is None or hint.just_search_in(['themes']) != []:
1129             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1130                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1131
1132         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1133                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1134
1135         topDocs = self.searcher.search(q, only_in, max_results)
1136         for found in topDocs.scoreDocs:
1137             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1138             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1139
1140         # query themes/content x author/title/tags
1141         q = BooleanQuery()
1142         in_content = BooleanQuery()
1143         in_meta = BooleanQuery()
1144
1145         for fld in ['themes_pl', 'content']:
1146             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1147
1148         for fld in ['tags', 'authors', 'title']:
1149             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1150
1151         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1152         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1153
1154         topDocs = self.searcher.search(q, only_in, max_results)
1155         for found in topDocs.scoreDocs:
1156             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1157             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1158
1159         return books
1160
1161     # def multisearch(self, query, max_results=50):
1162     #     """
1163     #     Search strategy:
1164     #     - (phrase) OR -> content
1165     #                   -> title
1166     #                   -> authors
1167     #     - (keywords)  -> authors
1168     #                   -> motyw
1169     #                   -> tags
1170     #                   -> content
1171     #     """
1172         # queryreader = StringReader(query)
1173         # tokens = self.get_tokens(queryreader)
1174
1175         # top_level = BooleanQuery()
1176         # Should = BooleanClause.Occur.SHOULD
1177
1178         # phrase_level = BooleanQuery()
1179         # phrase_level.setBoost(1.3)
1180
1181         # p_content = self.make_phrase(tokens, joined=True)
1182         # p_title = self.make_phrase(tokens, 'title')
1183         # p_author = self.make_phrase(tokens, 'author')
1184
1185         # phrase_level.add(BooleanClause(p_content, Should))
1186         # phrase_level.add(BooleanClause(p_title, Should))
1187         # phrase_level.add(BooleanClause(p_author, Should))
1188
1189         # kw_level = BooleanQuery()
1190
1191         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1192         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1193         # kw_level.add(j_themes, Should)
1194         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1195         # j_con = self.make_term_query(tokens, joined=True)
1196         # kw_level.add(j_con, Should)
1197
1198         # top_level.add(BooleanClause(phrase_level, Should))
1199         # top_level.add(BooleanClause(kw_level, Should))
1200
1201         # return None
1202
1203     def get_snippets(self, scoreDoc, query, field='content'):
1204         """
1205         Returns a snippet for found scoreDoc.
1206         """
1207         htmlFormatter = SimpleHTMLFormatter()
1208         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1209
1210         stored = self.searcher.doc(scoreDoc.doc)
1211
1212         position = stored.get('snippets_position')
1213         length = stored.get('snippets_length')
1214         if position is None or length is None:
1215             return None
1216         # locate content.
1217         book_id = int(stored.get('book_id'))
1218         snippets = Snippets(book_id).open()
1219         try:
1220             try:
1221                 text = snippets.get((int(position),
1222                                      int(length)))
1223             finally:
1224                 snippets.close()
1225
1226             tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1227             #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1228             snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1229
1230         except Exception as e:
1231             e2 = e
1232             if hasattr(e, 'getJavaException'):
1233                 e2 = unicode(e.getJavaException())
1234             raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1235                 e2)
1236         return snip
1237
1238     @staticmethod
1239     def enum_to_array(enum):
1240         """
1241         Converts a lucene TermEnum to array of Terms, suitable for
1242         addition to queries
1243         """
1244         terms = []
1245
1246         while True:
1247             t = enum.term()
1248             if t:
1249                 terms.append(t)
1250             if not enum.next(): break
1251
1252         if terms:
1253             return JArray('object')(terms, Term)
1254
1255     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1256         """
1257         Search for Tag objects using query.
1258         """
1259         if not pdcounter:
1260             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1261         tops = self.searcher.search(query, filters, max_results)
1262
1263         tags = []
1264         for found in tops.scoreDocs:
1265             doc = self.searcher.doc(found.doc)
1266             is_pdcounter = doc.get('is_pdcounter')
1267             if is_pdcounter:
1268                 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1269             else:
1270                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1271                 # don't add the pdcounter tag if same tag already exists
1272             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1273                 tags.append(tag)
1274                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1275         print 'returning %s' % tags
1276         return tags
1277
1278     def search_books(self, query, filter=None, max_results=10):
1279         """
1280         Searches for Book objects using query
1281         """
1282         bks = []
1283         tops = self.searcher.search(query, filter, max_results)
1284         for found in tops.scoreDocs:
1285             doc = self.searcher.doc(found.doc)
1286             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1287         return bks
1288
1289     def make_prefix_phrase(self, toks, field):
1290         q = MultiPhraseQuery()
1291         for i in range(len(toks)):
1292             t = Term(field, toks[i])
1293             if i == len(toks) - 1:
1294                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1295                 if pterms:
1296                     q.add(pterms)
1297                 else:
1298                     q.add(t)
1299             else:
1300                 q.add(t)
1301         return q
1302
1303     @staticmethod
1304     def term_filter(term, inverse=False):
1305         only_term = TermsFilter()
1306         only_term.addTerm(term)
1307
1308         if inverse:
1309             neg = BooleanFilter()
1310             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1311             only_term = neg
1312
1313         return only_term
1314
1315     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1316         """
1317         Return auto-complete hints for tags
1318         using prefix search.
1319         """
1320         toks = self.get_tokens(string, field='SIMPLE')
1321         top = BooleanQuery()
1322
1323         for field in ['tag_name', 'tag_name_pl']:
1324             if prefix:
1325                 q = self.make_prefix_phrase(toks, field)
1326             else:
1327                 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1328             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1329
1330         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1331
1332         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1333
1334     def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1335         """
1336         Returns auto-complete hints for book titles,
1337         because we do not index 'pseudo' title tags.
1338         Uses prefix search.
1339         """
1340         toks = self.get_tokens(string, field='SIMPLE')
1341
1342         if prefix:
1343             q = self.make_prefix_phrase(toks, 'title')
1344         else:
1345             q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1346
1347         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
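    # Both hint_* methods serve auto-complete; illustrative calls:
    #
    #   search.hint_tags(u"mick", max_results=10)      # -> Tag objects matching the prefix
    #   search.hint_books(u"pan tad", max_results=10)  # -> Book objects matching the prefix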
1348
1349     @staticmethod
1350     def chain_filters(filters, op=ChainedFilter.AND):
1351         """
1352         Chains a filter list together
1353         """
1354         filters = filter(lambda x: x is not None, filters)
1355         if not filters:
1356             return None
1357         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1358         return chf
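    # Example (illustrative): AND-chain a term filter with an optional hint
    # filter; None entries are dropped before chaining:
    #
    #   flt = Search.chain_filters([Search.term_filter(Term('is_book', 'true')),
    #                               hint and hint.part_filter()])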
1359
1360     def filtered_categories(self, tags):
1361         """
1362         Return a list of tag categories present in the tags list.
1363         """
1364         cats = {}
1365         for t in tags:
1366             cats[t.category] = True
1367         return cats.keys()
1368
1369     def hint(self):
1370         return Hint(self)