1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
33 import atexit
34 import traceback
35
36
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
38     def __init__(self):
39         polish = PolishAnalyzer(Version.LUCENE_34)
40         #        polish_gap.setPositionIncrementGap(999)
41
42         simple = SimpleAnalyzer(Version.LUCENE_34)
43         #        simple_gap.setPositionIncrementGap(999)
44
45         keyword = KeywordAnalyzer(Version.LUCENE_34)
46
47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
48
49         PerFieldAnalyzerWrapper.__init__(self, polish)
50
51         self.addAnalyzer("tags", simple)
52         self.addAnalyzer("technical_editors", simple)
53         self.addAnalyzer("editors", simple)
54         self.addAnalyzer("url", keyword)
55         self.addAnalyzer("source_url", keyword)
56         self.addAnalyzer("source_name", simple)
57         self.addAnalyzer("publisher", simple)
58         self.addAnalyzer("authors", simple)
59         self.addAnalyzer("title", simple)
60
61         self.addAnalyzer("is_book", keyword)
62         # shouldn't the title have two forms? _pl and simple?
63
64         self.addAnalyzer("themes", simple)
65         self.addAnalyzer("themes_pl", polish)
66
67         self.addAnalyzer("tag_name", simple)
68         self.addAnalyzer("tag_name_pl", polish)
69
70         self.addAnalyzer("translators", simple)
71
72         self.addAnalyzer("KEYWORD", keyword)
73         self.addAnalyzer("SIMPLE", simple)
74         self.addAnalyzer("POLISH", polish)
75
76
77 class IndexStore(object):
78     """
79     Provides access to search index.
80
81     self.store - lucene index directory
82     """
83     def __init__(self):
84         self.make_index_dir()
85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
86
87     def make_index_dir(self):
88         try:
89             os.makedirs(settings.SEARCH_INDEX)
90         except OSError as exc:
91             if exc.errno == errno.EEXIST:
92                 pass
93             else: raise
94
95
96 class IndexChecker(IndexStore):
97     def __init__(self):
98         IndexStore.__init__(self)
99
100     def check(self):
101         checker = CheckIndex(self.store)
102         status = checker.checkIndex()
103         return status
104
105
106 class Snippets(object):
107     """
108     This class manages snippet files for an indexed object (book).
109     The snippets are concatenated together, and their positions and
110     lengths are kept in Lucene index fields.
111     """
112     SNIPPET_DIR = "snippets"
113
114     def __init__(self, book_id):
115         try:
116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117         except OSError as exc:
118             if exc.errno == errno.EEXIST:
119                 pass
120             else: raise
121         self.book_id = book_id
122         self.file = None
123
124     def open(self, mode='r'):
125         """
126         Open the snippet file. Call .close() afterwards.
127         """
128         if 'b' not in mode:
129             mode += 'b'
130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
131         self.position = 0
132         return self
133
134     def add(self, snippet):
135         """
136         Append a snippet (unicode) to the snippet file.
137         Return a (position, length) tuple
138         """
139         txt = snippet.encode('utf-8')
140         l = len(txt)
141         self.file.write(txt)
142         pos = (self.position, l)
143         self.position += l
144         return pos
145
146     def get(self, pos):
147         """
148         Given a (position, length) tuple, return the unicode
149         snippet stored there.
150         """
151         self.file.seek(pos[0], 0)
152         txt = self.file.read(pos[1]).decode('utf-8')
153         return txt
154
155     def close(self):
156         """Close snippet file"""
157         self.file.close()
158
159
160 class BaseIndex(IndexStore):
161     """
162     Base index class.
163     Provides basic operations on index: opening, closing, optimizing.
164     """
165     def __init__(self, analyzer=None):
166         super(BaseIndex, self).__init__()
167         self.index = None
168         if not analyzer:
169             analyzer = WLAnalyzer()
170         self.analyzer = analyzer
171
172     def open(self, analyzer=None):
173         if self.index:
174             raise Exception("Index is already opened")
175         self.index = IndexWriter(self.store, analyzer or self.analyzer,\
176                                  IndexWriter.MaxFieldLength.LIMITED)
177         return self.index
178
179     def optimize(self):
180         self.index.optimize()
181
182     def close(self):
183         try:
184             self.index.optimize()
185         except JavaError as je:
186             print "Error during optimize phase, check index: %s" % je
187
188         self.index.close()
189         self.index = None
190
191     def __enter__(self):
192         self.open()
193         return self
194
195     def __exit__(self, type, value, tb):
196         self.close()
197
198
199 class Index(BaseIndex):
200     """
201     Class indexing books.
202     """
203     def __init__(self, analyzer=None):
204         super(Index, self).__init__(analyzer)
205
206     def index_tags(self):
207         """
208         Re-index global tag list.
209         Removes all tags from the index, then indexes them again.
210         Indexed fields include: id, name (with and without Polish stems), category.
211         """
212         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
213         self.index.deleteDocuments(q)
214
215         for tag in catalogue.models.Tag.objects.all():
216             doc = Document()
217             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
218             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
220             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
221             self.index.addDocument(doc)
222
223         for pdtag in PDCounterAuthor.objects.all():
224             doc = Document()
225             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
226             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
227             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
228             doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
229             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
230             self.index.addDocument(doc)
231
232     def create_book_doc(self, book):
233         """
234         Create a Lucene document referring to the book id.
235         """
236         doc = Document()
237         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
238         if book.parent is not None:
239             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
240         return doc
241
242     def remove_book(self, book):
243         """Removes a book from search index.
244         book - Book instance."""
245         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
246         self.index.deleteDocuments(q)
247
248     def index_book(self, book, book_info=None, overwrite=True):
249         """
250         Indexes the book.
251         Creates a lucene document for extracted metadata
252         and calls self.index_content() to index the contents of the book.
253         """
254         if overwrite:
255             self.remove_book(book)
256
257         book_doc = self.create_book_doc(book)
258         meta_fields = self.extract_metadata(book, book_info)
259         for f in meta_fields.values():
260             if isinstance(f, (list, tuple)):
261                 for elem in f:
262                     book_doc.add(elem)
263             else:
264                 book_doc.add(f)
265
266         self.index.addDocument(book_doc)
267         del book_doc
268
269         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
270
271     master_tags = [
272         'opowiadanie',
273         'powiesc',
274         'dramat_wierszowany_l',
275         'dramat_wierszowany_lp',
276         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
277         'wywiad',
278         ]
279
280     ignore_content_tags = [
281         'uwaga', 'extra',
282         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
283         'didaskalia',
284         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
285         ]
286
287     footnote_tags = ['pa', 'pt', 'pr', 'pe']
288
289     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
290
291     published_date_re = re.compile("([0-9]+)[\]. ]*$")
292
293     def extract_metadata(self, book, book_info=None):
294         """
295         Extract metadata from the book and return a map of fields keyed by field name.
296         """
297         fields = {}
298
299         if book_info is None:
300             book_info = dcparser.parse(open(book.xml_file.path))
301
302         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
303         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
304         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
305
306         # validator, name
307         for field in dcparser.BookInfo.FIELDS:
308             if hasattr(book_info, field.name):
309                 if not getattr(book_info, field.name):
310                     continue
311                 # since no type information is available, we use validator
312                 type_indicator = field.validator
313                 if type_indicator == dcparser.as_unicode:
314                     s = getattr(book_info, field.name)
315                     if field.multiple:
316                         s = ', '.join(s)
317                     try:
318                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
319                     except JavaError as je:
320                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
321                 elif type_indicator == dcparser.as_person:
322                     p = getattr(book_info, field.name)
323                     if isinstance(p, dcparser.Person):
324                         persons = unicode(p)
325                     else:
326                         persons = ', '.join(map(unicode, p))
327                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
328                 elif type_indicator == dcparser.as_date:
329                     dt = getattr(book_info, field.name)
330                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
331                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
332
333         # get published date
334         if hasattr(book_info, 'source_name'):
335             source = book_info.source_name
336             match = self.published_date_re.search(source)
337             if match is not None:
338                 fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
339
340         return fields
341
342     def add_gaps(self, fields, fieldname):
343         """
344         Interpose a list of fields with gap fields (indexed single spaces) and return the result.
345         This allows phrase queries that do not cross the gaps (when slop is 0).
346         """
347         def gap():
348             while True:
349                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
350         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
351
352     def get_master(self, root):
353         """
354         Returns the first master tag from an etree.
355         """
356         for master in root.iter():
357             if master.tag in self.master_tags:
358                 return master
359
360     def index_content(self, book, book_fields=[]):
361         """
362         Walks the book XML and extracts content from it.
363         Adds parts for each header tag and for each fragment.
364         """
365         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
366         root = wld.edoc.getroot()
367
368         master = self.get_master(root)
369         if master is None:
370             return []
371
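        # walker() is a depth-first generator over an element tree. It yields
        # (node, None, None) when entering an element, (None, text, None) for text
        # and tail content, and (None, None, node) when leaving an element, so the
        # loop below can react to open/close events and collect the text in between.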
372         def walker(node, ignore_tags=[]):
373
374             if node.tag not in ignore_tags:
375                 yield node, None, None
376                 if node.text is not None:
377                     yield None, node.text, None
378                 for child in list(node):
379                     for b, t, e in walker(child):
380                         yield b, t, e
381                 yield None, None, node
382
383             if node.tail is not None:
384                 yield None, node.tail, None
385             return
386
387         def fix_format(text):
388             #            separator = [u" ", u"\t", u".", u";", u","]
389             if isinstance(text, list):
390                 # need to join it first
391                 text = filter(lambda s: s is not None, text)
392                 text = u' '.join(text)
393                 # for i in range(len(text)):
394                 #     if i > 0:
395                 #         if text[i][0] not in separator\
396                 #             and text[i - 1][-1] not in separator:
397                 #          text.insert(i, u" ")
398
399             return re.sub("(?m)/$", "", text)
400
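        # add_part() builds a single Lucene child document for a section or fragment:
        # it repeats the book-level fields, stores the text position/length in the
        # snippet file, and (for fragments) records the anchor and themes.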
401         def add_part(snippets, **fields):
402             doc = self.create_book_doc(book)
403             for f in book_fields:
404                 doc.add(f)
405
406             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
407             doc.add(NumericField("header_span", Field.Store.YES, True)\
408                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
409             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
410
411             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
412                           Field.TermVector.WITH_POSITIONS_OFFSETS))
413
414             snip_pos = snippets.add(fields["content"])
415             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
416             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
417
418             if 'fragment_anchor' in fields:
419                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
420                               Field.Store.YES, Field.Index.NOT_ANALYZED))
421
422             if 'themes' in fields:
423                 themes, themes_pl = zip(*[
424                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
425                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
426                      for theme in fields['themes']])
427
428                 themes = self.add_gaps(themes, 'themes')
429                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
430
431                 for t in themes:
432                     doc.add(t)
433                 for t in themes_pl:
434                     doc.add(t)
435
436             return doc
437
438         def give_me_utf8(s):
439             if isinstance(s, unicode):
440                 return s.encode('utf-8')
441             else:
442                 return s
443
444         fragments = {}
445         snippets = Snippets(book.id).open('w')
446         try:
447             for position, header in enumerate(master):
448
449                 if header.tag in self.skip_header_tags:
450                     continue
451                 if header.tag is etree.Comment:
452                     continue
453
454                 # section content
455                 content = []
456                 footnote = []
457
458                 def all_content(text):
459                     for frag in fragments.values():
460                         frag['content'].append(text)
461                     content.append(text)
462                 handle_text = [all_content]
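                # handle_text is a stack of text handlers; only the topmost handler
                # receives each text node. all_content is the default, footnote
                # collectors are pushed inside footnotes, and None is pushed inside
                # <motyw> so theme names are not indexed as section content.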
463
464
465                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
466                     # handle footnotes
467                     if start is not None and start.tag in self.footnote_tags:
468                         footnote = []
469                         def collect_footnote(t):
470                             footnote.append(t)
471                         handle_text.append(collect_footnote)
472                     elif end is not None and end.tag in self.footnote_tags:
473                         handle_text.pop()
474                         doc = add_part(snippets, header_index=position, header_type=header.tag,
475                                        content=u''.join(footnote),
476                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
477                 
478                         self.index.addDocument(doc)
479                         print "@ footnote text: %s" % footnote
480                         footnote = []
481                     
482                     # handle fragments and themes.
483                     if start is not None and start.tag == 'begin':
484                         fid = start.attrib['id'][1:]
485                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
486
487                     # themes for this fragment
488                     elif start is not None and start.tag == 'motyw':
489                         fid = start.attrib['id'][1:]
490                         handle_text.append(None)
491                         if start.text is not None:
492                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
493                     elif end is not None and end.tag == 'motyw':
494                         handle_text.pop()
495
496                     elif start is not None and start.tag == 'end':
497                         fid = start.attrib['id'][1:]
498                         if fid not in fragments:
499                             continue  # a broken <end> node, skip it
500                         frag = fragments[fid]
501                         if frag['themes'] == []:
502                             continue  # empty themes list.
503                         del fragments[fid]
504
505                         doc = add_part(snippets,
506                                        header_type=frag['start_header'],
507                                        header_index=frag['start_section'],
508                                        header_span=position - frag['start_section'] + 1,
509                                        fragment_anchor=fid,
510                                        content=fix_format(frag['content']),
511                                        themes=frag['themes'])
512                         print '@ FRAG %s' % frag['content']
513                         self.index.addDocument(doc)
514
515                         # Collect content.
516
517                     if text is not None and handle_text:
518                         hdl = handle_text[-1]
519                         if hdl is not None:
520                             hdl(text)
521
522                         # in the end, add a section text.
523                 doc = add_part(snippets, header_index=position, header_type=header.tag,
524                                content=fix_format(content))
525                 print '@ CONTENT: %s' % fix_format(content)
526
527                 self.index.addDocument(doc)
528
529         finally:
530             snippets.close()
531
532
533 def log_exception_wrapper(f):
534     def _wrap(*a):
535         try:
536             f(*a)
537         except Exception as e:
538             print("Error in indexing thread: %s" % e)
539             traceback.print_exc()
540             raise
541     return _wrap
542
543
544 class ReusableIndex(Index):
545     """
546     Works like Index, but does not close/optimize the Lucene index
547     until program exit (uses an atexit hook).
548     This is useful for the importbooks command.
549
550     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
551     """
552     index = None
553
554     def open(self, analyzer=None, threads=4):
555         if ReusableIndex.index is not None:
556             self.index = ReusableIndex.index
557         else:
558             print("opening index")
559             Index.open(self, analyzer)
560             ReusableIndex.index = self.index
561             atexit.register(ReusableIndex.close_reusable)
562
563     # def index_book(self, *args, **kw):
564     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
565     #     ReusableIndex.pool_jobs.append(job)
566
567     @staticmethod
568     def close_reusable():
569         if ReusableIndex.index is not None:
570             ReusableIndex.index.optimize()
571             ReusableIndex.index.close()
572             ReusableIndex.index = None
573
574     def close(self):
575         pass
576
577
578 class JoinSearch(object):
579     """
580     This mixin could be used to handle block join queries.
581     (currently unused)
582     """
583     def __init__(self, *args, **kw):
584         super(JoinSearch, self).__init__(*args, **kw)
585
586     def wrapjoins(self, query, fields=[]):
587         """
588         This function modifies the query recursively: contained
589         Term and Phrase queries that match the provided fields
590         are wrapped in a BlockJoinQuery and thus delegated
591         to child documents.
592         """
593         if BooleanQuery.instance_(query):
594             qs = BooleanQuery.cast_(query)
595             for clause in qs:
596                 clause = BooleanClause.cast_(clause)
597                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
598             return qs
599         else:
600             termset = HashSet()
601             query.extractTerms(termset)
602             for t in termset:
603                 t = Term.cast_(t)
604                 if t.field() not in fields:
605                     return query
606             return BlockJoinQuery(query, self.parent_filter,
607                                   BlockJoinQuery.ScoreMode.Total)
608
609     def bsearch(self, query, max_results=50):
610         q = self.query(query)
611         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
612
613         tops = self.searcher.search(bjq, max_results)
614         bks = []
615         for found in tops.scoreDocs:
616             doc = self.searcher.doc(found.doc)
617             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
618         return (bks, tops.totalHits)
619
620
621 class SearchResult(object):
622     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
623         if tokens_cache is None: tokens_cache = {}
624
625         if score:
626             self._score = score
627         else:
628             self._score = scoreDocs.score
629
630         self.boost = 1.0
631
632         self._hits = []
633         self._processed_hits = None  # processed hits
634
635         stored = search.searcher.doc(scoreDocs.doc)
636         self.book_id = int(stored.get("book_id"))
637
638         pd = stored.get("published_date")
639         if pd is None:
640             pd = 0
641         self.published_date = int(pd)
642
643         header_type = stored.get("header_type")
644         # we have a content hit in some header or fragment
645         if header_type is not None:
646             sec = (header_type, int(stored.get("header_index")))
647             header_span = stored.get('header_span')
648             header_span = header_span is not None and int(header_span) or 1
649
650             fragment = stored.get("fragment_anchor")
651
652             if snippets:
653                 snippets = snippets.replace("/\n", "\n")
654             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
655
656             self._hits.append(hit)
657
658         self.search = search
659         self.searched = searched
660         self.tokens_cache = tokens_cache
661
662     @property
663     def score(self):
664         return self._score * self.boost
665
666     def merge(self, other):
667         if self.book_id != other.book_id:
668             raise ValueError("this search result is for book %d; tried to merge with book %d" % (self.book_id, other.book_id))
669         self._hits += other._hits
670         if other.score > self.score:
671             self._score = other._score
672         return self
673
674     def get_book(self):
675         return catalogue.models.Book.objects.get(id=self.book_id)
676
677     book = property(get_book)
678
679     @property
680     def hits(self):
681         if self._processed_hits is not None:
682             return self._processed_hits
683
684         POSITION = 0
685         FRAGMENT = 1
686         POSITION_INDEX = 1
687         POSITION_SPAN = 2
688         SCORE = 2
689         OTHER = 3
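        # Each raw hit in self._hits is a tuple:
        #     ((header_type, header_index, header_span), fragment_anchor, score, extra)
        # and the constants above index into it (POSITION_* index the inner tuple).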
690
691         # split hits into fragment hits and section hits; drop sections already covered by a fragment
692         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
693         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
694         sect = filter(lambda s: 0 == len(filter(
695             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
696             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
697             frags)), sect)
698
699         hits = []
700
701         # remove duplicate fragments
702         fragments = {}
703         for f in frags:
704             fid = f[FRAGMENT]
705             if fid in fragments:
706                 if fragments[fid][SCORE] >= f[SCORE]:
707                     continue
708             fragments[fid] = f
709         frags = fragments.values()
710
711         # remove duplicate sections
712         sections = {}
713
714         for s in sect:
715             si = s[POSITION][POSITION_INDEX]
716             # skip existing
717             if si in sections:
718                 if sections[si]['score'] >= s[SCORE]:
719                     continue
720
721             m = {'score': s[SCORE],
722                  'section_number': s[POSITION][POSITION_INDEX] + 1,
723                  }
724             m.update(s[OTHER])
725             sections[si] = m
726
727         hits = sections.values()
728
729         for f in frags:
730             try:
731                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
732             except catalogue.models.Fragment.DoesNotExist:
733                 # stale index
734                 continue
735
736             # Figure out if we were searching for a token matching some word in theme name.
737             themes = frag.tags.filter(category='theme')
738             themes_hit = []
739             if self.searched is not None:
740                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
741                 for theme in themes:
742                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
743                     for t in tokens:
744                         if t in name_tokens:
745                             if theme not in themes_hit:
746                                 themes_hit.append(theme)
747                             break
748
749             m = {'score': f[SCORE],
750                  'fragment': frag,
751                  'section_number': f[POSITION][POSITION_INDEX] + 1,
752                  'themes': themes,
753                  'themes_hit': themes_hit
754                  }
755             m.update(f[OTHER])
756             hits.append(m)
757
758         hits.sort(key=lambda h: h['score'], reverse=True)
759
760         self._processed_hits = hits
761
762         return hits
763
764     def __unicode__(self):
765         return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)
766
767     @staticmethod
768     def aggregate(*result_lists):
769         books = {}
770         for rl in result_lists:
771             for r in rl:
772                 if r.book_id in books:
773                     books[r.book_id].merge(r)
774                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
775                 else:
776                     books[r.book_id] = r
777         return books.values()
778
779     def __cmp__(self, other):
780         c = cmp(self.score, other.score)
781         if c == 0:
782             # this is inverted, because earlier date is better
783             return cmp(other.published_date, self.published_date)
784         else:
785             return c
786
787
788 class Hint(object):
789     """
790     Given some hint information (things we already know about
791     our search target - like author, title (a specific book), epoch, genre, kind)
792     we can narrow down the search using filters.
793     """
794     def __init__(self, search):
795         """
796         Accepts a Searcher instance.
797         """
798         self.search = search
799         self.book_tags = {}
800         self.part_tags = []
801         self._books = []
802
803     def books(self, *books):
804         """
805         Give a hint that we search these books.
806         """
807         self._books = books
808
809     def tags(self, tags):
810         """
811         Give a hint that this list of Tag objects
812         is necessary.
813         """
814         for t in tags:
815             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
816                 lst = self.book_tags.get(t.category, [])
817                 lst.append(t)
818                 self.book_tags[t.category] = lst
819             if t.category in ['theme', 'theme_pl']:
820                 self.part_tags.append(t)
821
822     def tag_filter(self, tags, field='tags'):
823         """
824         Given a list of tags and an optional field (they are normally in the 'tags' field),
825         returns a filter accepting only books with those specific tags.
826         """
827         q = BooleanQuery()
828
829         for tag in tags:
830             toks = self.search.get_tokens(tag.name, field=field)
831             tag_phrase = PhraseQuery()
832             for tok in toks:
833                 tag_phrase.add(Term(field, tok))
834             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
835
836         return QueryWrapperFilter(q)
837
838     def book_filter(self):
839         """
840         Filters using book tags (all tag categories except theme).
841         """
842         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
843         if tags:
844             return self.tag_filter(tags)
845         else:
846             return None
847
848     def part_filter(self):
849         """
850         This filter can be used to look for book parts.
851         It filters on book id and/or themes.
852         """
853         fs = []
854         if self.part_tags:
855             fs.append(self.tag_filter(self.part_tags, field='themes'))
856
857         if self._books != []:
858             bf = BooleanFilter()
859             for b in self._books:
860                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
861                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
862             fs.append(bf)
863
864         return Search.chain_filters(fs)
865
866     def should_search_for_book(self):
867         return self._books == []
868
869     def just_search_in(self, all):
870         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
871         some = []
872         for field in all:
873             if field == 'authors' and 'author' in self.book_tags:
874                 continue
875             if field == 'title' and self._books != []:
876                 continue
877             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
878                 continue
879             some.append(field)
880         return some
881
882
883 class Search(IndexStore):
884     """
885     Search facilities.
886     """
887     def __init__(self, default_field="content"):
888         IndexStore.__init__(self)
889         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
890         # self.analyzer = WLAnalyzer()
891         self.searcher = IndexSearcher(self.store, True)
892         self.parser = QueryParser(Version.LUCENE_34, default_field,
893                                   self.analyzer)
894
895         self.parent_filter = TermsFilter()
896         self.parent_filter.addTerm(Term("is_book", "true"))
897
898     def query(self, query):
899         """Parse query in default Lucene Syntax. (for humans)
900         """
901         return self.parser.parse(query)
902
903     def simple_search(self, query, max_results=50):
904         """Runs a query for books using lucene syntax. (for humans)
905         Returns (books, total_hits)
906         """
907
908         tops = self.searcher.search(self.query(query), max_results)
909         bks = []
910         for found in tops.scoreDocs:
911             doc = self.searcher.doc(found.doc)
912             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
913         return (bks, tops.totalHits)
914
915     def get_tokens(self, searched, field='content', cached=None):
916         """returns tokens analyzed by a proper (for a field) analyzer
917         argument can be: StringReader, string/unicode, or tokens. In the last case
918         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
919         """
920         if cached is not None and field in cached:
921             return cached[field]
922
923         if isinstance(searched, str) or isinstance(searched, unicode):
924             searched = StringReader(searched)
925         elif isinstance(searched, list):
926             return searched
927
928         searched.reset()
929         tokens = self.analyzer.reusableTokenStream(field, searched)
930         toks = []
931         while tokens.incrementToken():
932             cta = tokens.getAttribute(CharTermAttribute.class_)
933             toks.append(cta.toString())
934
935         if cached is not None:
936             cached[field] = toks
937
938         return toks
939
940     def fuzziness(self, fuzzy):
941         """Helper method to sanitize fuzziness"""
942         if not fuzzy:
943             return None
944         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
945             return fuzzy
946         else:
947             return 0.5
948
949     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
950         """
951         Return a PhraseQuery with a series of tokens.
952         """
953         if fuzzy:
954             phrase = MultiPhraseQuery()
955             for t in tokens:
956                 term = Term(field, t)
957                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
958                 fuzzterms = []
959
960                 while True:
961                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
962                     ft = fuzzterm.term()
963                     if ft:
964                         fuzzterms.append(ft)
965                     if not fuzzterm.next(): break
966                 if fuzzterms:
967                     phrase.add(JArray('object')(fuzzterms, Term))
968                 else:
969                     phrase.add(term)
970         else:
971             phrase = PhraseQuery()
972             phrase.setSlop(slop)
973             for t in tokens:
974                 term = Term(field, t)
975                 phrase.add(term)
976         return phrase
977
978     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
979         """
980         Returns term queries joined by boolean query.
981         modal - applies to boolean query
982         fuzzy - whether the query should be fuzzy.
983         """
984         q = BooleanQuery()
985         for t in tokens:
986             term = Term(field, t)
987             if fuzzy:
988                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
989             else:
990                 term = TermQuery(term)
991             q.add(BooleanClause(term, modal))
992         return q
993
994     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
995                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
996         if filters is None: filters = []
997         if tokens_cache is None: tokens_cache = {}
998
999         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1000
1001         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1002         if book:
1003             filters.append(self.term_filter(Term('is_book', 'true')))
1004         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1005
1006         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1007
1008     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1009                     filters=None, tokens_cache=None, boost=None, snippets=True):
1010         if filters is None: filters = []
1011         if tokens_cache is None: tokens_cache = {}
1012
1013         if book:
1014             filters.append(self.term_filter(Term('is_book', 'true')))
1015
1016         query = BooleanQuery()
1017
1018         for fld in fields:
1019             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1020
1021             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1022                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1023
1024         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1025
1026         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1027                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1028
1029     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1030         """
1031         Search for perfect book matches. Just see if the query matches with some author or title,
1032         taking hints into account.
1033         """
1034         fields_to_search = ['authors', 'title']
1035         only_in = None
1036         if hint:
1037             if not hint.should_search_for_book():
1038                 return []
1039             fields_to_search = hint.just_search_in(fields_to_search)
1040             only_in = hint.book_filter()
1041
1042         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1043
1044         books = []
1045         for q in qrys:
1046             top = self.searcher.search(q,
1047                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1048                 max_results)
1049             for found in top.scoreDocs:
1050                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1051         return books
1052
1053     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1054         fields_to_search = ['tags', 'authors', 'title']
1055
1056         only_in = None
1057         if hint:
1058             if not hint.should_search_for_book():
1059                 return []
1060             fields_to_search = hint.just_search_in(fields_to_search)
1061             only_in = hint.book_filter()
1062
1063         tokens = self.get_tokens(searched, field='SIMPLE')
1064
1065         q = BooleanQuery()
1066
1067         for fld in fields_to_search:
1068             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1069                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1070
1071         books = []
1072         top = self.searcher.search(q,
1073                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1074             max_results)
1075         for found in top.scoreDocs:
1076             books.append(SearchResult(self, found, how_found="search_book"))
1077
1078         return books
1079
1080     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1081         """
1082         Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
1083         some part/fragment of the book.
1084         """
1085         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1086
1087         flt = None
1088         if hint:
1089             flt = hint.part_filter()
1090
1091         books = []
1092         for q in qrys:
1093             top = self.searcher.search(q,
1094                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1095                                                            flt]),
1096                                        max_results)
1097             for found in top.scoreDocs:
1098                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1099
1100         return books
1101
1102     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1103         """
1104         Tries to use search terms to match different fields of the book (or its parts).
1105         E.g. one word can be an author's surname, another a part of the title, and the rest
1106         some words from the third chapter.
1107         """
1108         if tokens_cache is None: tokens_cache = {}
1109         books = []
1110         only_in = None
1111
1112         if hint:
1113             only_in = hint.part_filter()
1114
1115         # content only query : themes x content
1116         q = BooleanQuery()
1117
1118         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1119         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1120
1121         # only search in themes when we do not already filter by themes
1122         if hint is None or hint.just_search_in(['themes']) != []:
1123             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1124                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1125
1126         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1127                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1128
1129         topDocs = self.searcher.search(q, only_in, max_results)
1130         for found in topDocs.scoreDocs:
1131             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1132             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1133
1134         # query themes/content x author/title/tags
1135         q = BooleanQuery()
1136         in_content = BooleanQuery()
1137         in_meta = BooleanQuery()
1138
1139         for fld in ['themes_pl', 'content']:
1140             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1141
1142         for fld in ['tags', 'authors', 'title']:
1143             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1144
1145         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1146         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1147
1148         topDocs = self.searcher.search(q, only_in, max_results)
1149         for found in topDocs.scoreDocs:
1150             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1151             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1152
1153         return books
1154
1155     # def multisearch(self, query, max_results=50):
1156     #     """
1157     #     Search strategy:
1158     #     - (phrase) OR -> content
1159     #                   -> title
1160     #                   -> authors
1161     #     - (keywords)  -> authors
1162     #                   -> motyw
1163     #                   -> tags
1164     #                   -> content
1165     #     """
1166         # queryreader = StringReader(query)
1167         # tokens = self.get_tokens(queryreader)
1168
1169         # top_level = BooleanQuery()
1170         # Should = BooleanClause.Occur.SHOULD
1171
1172         # phrase_level = BooleanQuery()
1173         # phrase_level.setBoost(1.3)
1174
1175         # p_content = self.make_phrase(tokens, joined=True)
1176         # p_title = self.make_phrase(tokens, 'title')
1177         # p_author = self.make_phrase(tokens, 'author')
1178
1179         # phrase_level.add(BooleanClause(p_content, Should))
1180         # phrase_level.add(BooleanClause(p_title, Should))
1181         # phrase_level.add(BooleanClause(p_author, Should))
1182
1183         # kw_level = BooleanQuery()
1184
1185         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1186         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1187         # kw_level.add(j_themes, Should)
1188         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1189         # j_con = self.make_term_query(tokens, joined=True)
1190         # kw_level.add(j_con, Should)
1191
1192         # top_level.add(BooleanClause(phrase_level, Should))
1193         # top_level.add(BooleanClause(kw_level, Should))
1194
1195         # return None
1196
1197     def get_snippets(self, scoreDoc, query, field='content'):
1198         """
1199         Returns a snippet for found scoreDoc.
1200         """
1201         htmlFormatter = SimpleHTMLFormatter()
1202         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1203
1204         stored = self.searcher.doc(scoreDoc.doc)
1205
1206         position = stored.get('snippets_position')
1207         length = stored.get('snippets_length')
1208         if position is None or length is None:
1209             return None
1210         # locate content.
1211         snippets = Snippets(stored.get('book_id')).open()
1212         try:
1213             text = snippets.get((int(position),
1214                                  int(length)))
1215         finally:
1216             snippets.close()
1217
1218         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1219         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1220         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1221
1222         return snip
1223
1224     @staticmethod
1225     def enum_to_array(enum):
1226         """
1227         Converts a lucene TermEnum to array of Terms, suitable for
1228         addition to queries
1229         """
1230         terms = []
1231
1232         while True:
1233             t = enum.term()
1234             if t:
1235                 terms.append(t)
1236             if not enum.next(): break
1237
1238         if terms:
1239             return JArray('object')(terms, Term)
1240
1241     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1242         """
1243         Search for Tag objects using query.
1244         """
1245         if not pdcounter:
1246             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1247         tops = self.searcher.search(query, filters, max_results)
1248
1249         tags = []
1250         for found in tops.scoreDocs:
1251             doc = self.searcher.doc(found.doc)
1252             is_pdcounter = doc.get('is_pdcounter')
1253             if is_pdcounter:
1254                 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1255             else:
1256                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1257                 # don't add the pdcounter tag if same tag already exists
1258             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1259                 tags.append(tag)
1260                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1261         print 'returning %s' % tags
1262         return tags
1263
1264     def search_books(self, query, filter=None, max_results=10):
1265         """
1266         Searches for Book objects using query
1267         """
1268         bks = []
1269         tops = self.searcher.search(query, filter, max_results)
1270         for found in tops.scoreDocs:
1271             doc = self.searcher.doc(found.doc)
1272             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1273         return bks
1274
1275     def make_prefix_phrase(self, toks, field):
1276         q = MultiPhraseQuery()
1277         for i in range(len(toks)):
1278             t = Term(field, toks[i])
1279             if i == len(toks) - 1:
1280                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1281                 if pterms:
1282                     q.add(pterms)
1283                 else:
1284                     q.add(t)
1285             else:
1286                 q.add(t)
1287         return q
1288
1289     @staticmethod
1290     def term_filter(term, inverse=False):
1291         only_term = TermsFilter()
1292         only_term.addTerm(term)
1293
1294         if inverse:
1295             neg = BooleanFilter()
1296             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1297             only_term = neg
1298
1299         return only_term
1300
1301     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
1302         """
1303         Return auto-complete hints for tags
1304         using prefix search.
1305         """
1306         toks = self.get_tokens(string, field='SIMPLE')
1307         top = BooleanQuery()
1308
1309         for field in ['tag_name', 'tag_name_pl']:
1310             if prefix:
1311                 q = self.make_prefix_phrase(toks, field)
1312             else:
1313                 q = self.make_term_query(toks, field)
1314             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1315
1316         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1317
1318         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1319
1320     def hint_books(self, string, max_results=50, prefix=True):
1321         """
1322         Returns auto-complete hints for book titles,
1323         because we do not index 'pseudo' title-tags.
1324         Uses prefix search.
1325         """
1326         toks = self.get_tokens(string, field='SIMPLE')
1327
1328         if prefix:
1329             q = self.make_prefix_phrase(toks, 'title')
1330         else:
1331             q = self.make_term_query(toks, 'title')
1332
1333         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1334
1335     @staticmethod
1336     def chain_filters(filters, op=ChainedFilter.AND):
1337         """
1338         Chains a filter list together
1339         """
1340         filters = filter(lambda x: x is not None, filters)
1341         if not filters:
1342             return None
1343         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1344         return chf
1345
1346     def filtered_categories(self, tags):
1347         """
1348         Return a list of tag categories, present in tags list.
1349         """
1350         cats = {}
1351         for t in tags:
1352             cats[t.category] = True
1353         return cats.keys()
1354
1355     def hint(self):
1356         return Hint(self)