some speedups for batch indexing
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
33 import atexit
34 import traceback
35
36
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
38     def __init__(self):
39         polish = PolishAnalyzer(Version.LUCENE_34)
40         #        polish_gap.setPositionIncrementGap(999)
41
42         simple = SimpleAnalyzer(Version.LUCENE_34)
43         #        simple_gap.setPositionIncrementGap(999)
44
45         keyword = KeywordAnalyzer(Version.LUCENE_34)
46
47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
48
49         PerFieldAnalyzerWrapper.__init__(self, polish)
50
51         self.addAnalyzer("tags", simple)
52         self.addAnalyzer("technical_editors", simple)
53         self.addAnalyzer("editors", simple)
54         self.addAnalyzer("url", keyword)
55         self.addAnalyzer("source_url", keyword)
56         self.addAnalyzer("source_name", simple)
57         self.addAnalyzer("publisher", simple)
58         self.addAnalyzer("authors", simple)
59         self.addAnalyzer("title", simple)
60
61         self.addAnalyzer("is_book", keyword)
62         # shouldn't the title have two forms? _pl and simple?
63
64         self.addAnalyzer("themes", simple)
65         self.addAnalyzer("themes_pl", polish)
66
67         self.addAnalyzer("tag_name", simple)
68         self.addAnalyzer("tag_name_pl", polish)
69
70         self.addAnalyzer("translators", simple)
71
72         self.addAnalyzer("KEYWORD", keyword)
73         self.addAnalyzer("SIMPLE", simple)
74         self.addAnalyzer("POLISH", polish)
75
76
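# Editor's note (sketch, not part of the original source): PerFieldAnalyzerWrapper
# falls back to the default analyzer passed to __init__ (here PolishAnalyzer) for
# any field not registered above, e.g. the 'content' field:
#
#   analyzer = WLAnalyzer()
#   # 'tags' -> SimpleAnalyzer, 'url' -> KeywordAnalyzer, 'content' -> PolishAnalyzer (default)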
77 class IndexStore(object):
78     """
79     Provides access to search index.
80
81     self.store - lucene index directory
82     """
83     def __init__(self):
84         self.make_index_dir()
85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
86
87     def make_index_dir(self):
88         try:
89             os.makedirs(settings.SEARCH_INDEX)
90         except OSError as exc:
91             if exc.errno == errno.EEXIST:
92                 pass
93             else: raise
94
95
96 class IndexChecker(IndexStore):
97     def __init__(self):
98         IndexStore.__init__(self)
99
100     def check(self):
101         checker = CheckIndex(self.store)
102         status = checker.checkIndex()
103         return status
104
105
106 class Snippets(object):
107     """
108     This class manages snippet files for an indexed object (book).
109     The snippets are concatenated together, and their positions and
110     lengths are kept in lucene index fields.
111     """
112     SNIPPET_DIR = "snippets"
113
114     def __init__(self, book_id):
115         try:
116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117         except OSError as exc:
118             if exc.errno == errno.EEXIST:
119                 pass
120             else: raise
121         self.book_id = book_id
122         self.file = None
123
124     def open(self, mode='r'):
125         """
126         Open the snippet file. Call .close() afterwards.
127         """
128         if 'b' not in mode:
129             mode += 'b'
130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
131         self.position = 0
132         return self
133
134     def add(self, snippet):
135         """
136         Append a snippet (unicode) to the snippet file.
137         Return a (position, length) tuple
138         """
139         txt = snippet.encode('utf-8')
140         l = len(txt)
141         self.file.write(txt)
142         pos = (self.position, l)
143         self.position += l
144         return pos
145
146     def get(self, pos):
147         """
148         Given a (position, length) tuple, return the unicode snippet
149         stored there.
150         """
151         self.file.seek(pos[0], 0)
152         txt = self.file.read(pos[1]).decode('utf-8')
153         return txt
154
155     def close(self):
156         """Close snippet file"""
157         self.file.close()
158
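# Example usage (editor's sketch, not part of the original module; assumes a book
# with id 1 and a writable SEARCH_INDEX directory):
#
#   snips = Snippets(1).open('w')
#   try:
#       pos = snips.add(u"Litwo! Ojczyzno moja!")   # returns a (position, length) tuple
#   finally:
#       snips.close()
#   print Snippets(1).open().get(pos)               # round-trips the snippet text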
159
160 class BaseIndex(IndexStore):
161     """
162     Base index class.
163     Provides basic operations on index: opening, closing, optimizing.
164     """
165     def __init__(self, analyzer=None):
166         super(BaseIndex, self).__init__()
167         self.index = None
168         if not analyzer:
169             analyzer = WLAnalyzer()
170         self.analyzer = analyzer
171
172     def open(self, analyzer=None):
173         if self.index:
174             raise Exception("Index is already opened")
175         self.index = IndexWriter(self.store, self.analyzer,\
176                                  IndexWriter.MaxFieldLength.LIMITED)
177         return self.index
178
179     def optimize(self):
180         self.index.optimize()
181
182     def close(self):
183         try:
184             self.index.optimize()
185         except JavaError, je:
186             print "Error during optimize phase, check index: %s" % je
187
188         self.index.close()
189         self.index = None
190
191     def __enter__(self):
192         self.open()
193         return self
194
195     def __exit__(self, type, value, tb):
196         self.close()
197
198
199 class Index(BaseIndex):
200     """
201     Class indexing books.
202     """
203     def __init__(self, analyzer=None):
204         super(Index, self).__init__(analyzer)
205
206     def index_tags(self):
207         """
208         Re-index global tag list.
209         Removes all tags from the index, then indexes them again.
210         Indexed fields include: id, name (with and without Polish stems), category.
211         """
212         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
213         self.index.deleteDocuments(q)
214
215         for tag in catalogue.models.Tag.objects.all():
216             doc = Document()
217             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
218             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
220             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
221             self.index.addDocument(doc)
222
223         for pdtag in PDCounterAuthor.objects.all():
224             doc = Document()
225             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
226             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
227             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
228             doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
229             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
230             self.index.addDocument(doc)
231
232     def create_book_doc(self, book):
233         """
234         Create a lucene document referring to the book id.
235         """
236         doc = Document()
237         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
238         if book.parent is not None:
239             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
240         return doc
241
242     def remove_book(self, book):
243         """Removes a book from search index.
244         book - Book instance."""
245         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
246         self.index.deleteDocuments(q)
247
248     def index_book(self, book, book_info=None, overwrite=True):
249         """
250         Indexes the book.
251         Creates a lucene document for extracted metadata
252         and calls self.index_content() to index the contents of the book.
253         """
254         if overwrite:
255             self.remove_book(book)
256
257         book_doc = self.create_book_doc(book)
258         meta_fields = self.extract_metadata(book, book_info)
259         for f in meta_fields.values():
260             if isinstance(f, list) or isinstance(f, tuple):
261                 for elem in f:
262                     book_doc.add(elem)
263             else:
264                 book_doc.add(f)
265
266         self.index.addDocument(book_doc)
267         del book_doc
268
269         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
270
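    # Example (editor's sketch, not from the original source): indexing a single book
    # with the writer opened via the context manager from BaseIndex.__enter__/__exit__.
    # The slug used here is hypothetical:
    #
    #   book = catalogue.models.Book.objects.get(slug='pan-tadeusz')
    #   with Index() as idx:
    #       idx.index_book(book)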
271     master_tags = [
272         'opowiadanie',
273         'powiesc',
274         'dramat_wierszowany_l',
275         'dramat_wierszowany_lp',
276         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
277         'wywiad',
278         ]
279
280     ignore_content_tags = [
281         'uwaga', 'extra',
282         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
283         'didaskalia',
284         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
285         ]
286
287     footnote_tags = ['pa', 'pt', 'pr', 'pe']
288
289     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
290
291     published_date_re = re.compile("([0-9]+)[\]. ]*$")
292
293     def extract_metadata(self, book, book_info=None):
294         """
295         Extract metadata from the book and return a map of fields keyed by field name.
296         """
297         fields = {}
298
299         if book_info is None:
300             book_info = dcparser.parse(open(book.xml_file.path))
301
302         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
303         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
304         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
305
306         # validator, name
307         for field in dcparser.BookInfo.FIELDS:
308             if hasattr(book_info, field.name):
309                 if not getattr(book_info, field.name):
310                     continue
311                 # since no type information is available, we use validator
312                 type_indicator = field.validator
313                 if type_indicator == dcparser.as_unicode:
314                     s = getattr(book_info, field.name)
315                     if field.multiple:
316                         s = ', '.join(s)
317                     try:
318                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
319                     except JavaError as je:
320                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
321                 elif type_indicator == dcparser.as_person:
322                     p = getattr(book_info, field.name)
323                     if isinstance(p, dcparser.Person):
324                         persons = unicode(p)
325                     else:
326                         persons = ', '.join(map(unicode, p))
327                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
328                 elif type_indicator == dcparser.as_date:
329                     dt = getattr(book_info, field.name)
330                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
331                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
332
333         # get published date
334         pd = None
335         if hasattr(book_info, 'source_name') and book_info.source_name:
336             match = self.published_date_re.search(book_info.source_name)
337             if match is not None:
338                 pd = str(match.groups()[0])
339         if not pd: pd = ""
340         fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
341
342         return fields
343
344     def add_gaps(self, fields, fieldname):
345         """
346         Interposes a list of fields with gap fields (indexed spaces) and returns the result.
347         This allows phrase queries which do not cross the gaps (when slop is 0).
348         """
349         def gap():
350             while True:
351                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
352         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
353
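    # Sketch of the effect (editor's note, not from the original source): for two tag
    # values "Adam Mickiewicz" and "Juliusz Słowacki", add_gaps(...) yields
    #   [tags:"Adam Mickiewicz", tags:" ", tags:"Juliusz Słowacki"]
    # so the gap token takes up one position and a slop-0 phrase query such as
    # "Mickiewicz Juliusz" cannot match across the boundary between the two values.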
354     def get_master(self, root):
355         """
356         Returns the first master tag from an etree.
357         """
358         for master in root.iter():
359             if master.tag in self.master_tags:
360                 return master
361
362     def index_content(self, book, book_fields=[]):
363         """
364         Walks the book XML and extracts content from it.
365         Adds parts for each header tag and for each fragment.
366         """
367         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
368         root = wld.edoc.getroot()
369
370         master = self.get_master(root)
371         if master is None:
372             return []
373
374         def walker(node, ignore_tags=[]):
375
376             if node.tag not in ignore_tags:
377                 yield node, None, None
378                 if node.text is not None:
379                     yield None, node.text, None
380                 for child in list(node):
381                     for b, t, e in walker(child, ignore_tags=ignore_tags):
382                         yield b, t, e
383                 yield None, None, node
384
385             if node.tail is not None:
386                 yield None, node.tail, None
387             return
388
389         def fix_format(text):
390             #            separator = [u" ", u"\t", u".", u";", u","]
391             if isinstance(text, list):
392                 # need to join it first
393                 text = filter(lambda s: s is not None, text)
394                 text = u' '.join(text)
395                 # for i in range(len(text)):
396                 #     if i > 0:
397                 #         if text[i][0] not in separator\
398                 #             and text[i - 1][-1] not in separator:
399                 #          text.insert(i, u" ")
400
401             return re.sub("(?m)/$", "", text)
402
403         def add_part(snippets, **fields):
404             doc = self.create_book_doc(book)
405             for f in book_fields:
406                 doc.add(f)
407
408             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
409             doc.add(NumericField("header_span", Field.Store.YES, True)\
410                     .setIntValue(fields.get('header_span', 1)))
411             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
412
413             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
414                           Field.TermVector.WITH_POSITIONS_OFFSETS))
415
416             snip_pos = snippets.add(fields["content"])
417             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
418             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
419
420             if 'fragment_anchor' in fields:
421                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
422                               Field.Store.YES, Field.Index.NOT_ANALYZED))
423
424             if 'themes' in fields:
425                 themes, themes_pl = zip(*[
426                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
427                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
428                      for theme in fields['themes']])
429
430                 themes = self.add_gaps(themes, 'themes')
431                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
432
433                 for t in themes:
434                     doc.add(t)
435                 for t in themes_pl:
436                     doc.add(t)
437
438             return doc
439
440         def give_me_utf8(s):
441             if isinstance(s, unicode):
442                 return s.encode('utf-8')
443             else:
444                 return s
445
446         fragments = {}
447         snippets = Snippets(book.id).open('w')
448         try:
449             for position, header in enumerate(master):
450
451                 if header.tag in self.skip_header_tags:
452                     continue
453                 if header.tag is etree.Comment:
454                     continue
455
456                 # section content
457                 content = []
458                 footnote = []
459
460                 def all_content(text):
461                     for frag in fragments.values():
462                         frag['content'].append(text)
463                     content.append(text)
464                 handle_text = [all_content]
465
466
467                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
468                     # handle footnotes
469                     if start is not None and start.tag in self.footnote_tags:
470                         footnote = []
471                         def collect_footnote(t):
472                             footnote.append(t)
473                         handle_text.append(collect_footnote)
474                     elif end is not None and end.tag in self.footnote_tags:
475                         handle_text.pop()
476                         doc = add_part(snippets, header_index=position, header_type=header.tag,
477                                        content=u''.join(footnote),
478                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
479                 
480                         self.index.addDocument(doc)
481                         #print "@ footnote text: %s" % footnote
482                         footnote = []
483                     
484                     # handle fragments and themes.
485                     if start is not None and start.tag == 'begin':
486                         fid = start.attrib['id'][1:]
487                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
488
489                     # themes for this fragment
490                     elif start is not None and start.tag == 'motyw':
491                         fid = start.attrib['id'][1:]
492                         handle_text.append(None)
493                         if start.text is not None:
494                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
495                     elif end is not None and end.tag == 'motyw':
496                         handle_text.pop()
497
498                     elif start is not None and start.tag == 'end':
499                         fid = start.attrib['id'][1:]
500                         if fid not in fragments:
501                             continue  # a broken <end> node, skip it
502                         frag = fragments[fid]
503                         if frag['themes'] == []:
504                             continue  # empty themes list.
505                         del fragments[fid]
506
507                         doc = add_part(snippets,
508                                        header_type=frag['start_header'],
509                                        header_index=frag['start_section'],
510                                        header_span=position - frag['start_section'] + 1,
511                                        fragment_anchor=fid,
512                                        content=fix_format(frag['content']),
513                                        themes=frag['themes'])
514                         #print '@ FRAG %s' % frag['content']
515                         self.index.addDocument(doc)
516
517                         # Collect content.
518
519                     if text is not None and handle_text:
520                         hdl = handle_text[-1]
521                         if hdl is not None:
522                             hdl(text)
523
524                         # in the end, add a section text.
525                 doc = add_part(snippets, header_index=position, header_type=header.tag,
526                                content=fix_format(content))
527                 #print '@ CONTENT: %s' % fix_format(content)
528
529                 self.index.addDocument(doc)
530
531         finally:
532             snippets.close()
533
534
535 def log_exception_wrapper(f):
536     def _wrap(*a):
537         try:
538             f(*a)
539         except Exception, e:
540             print("Error in indexing thread: %s" % e)
541             traceback.print_exc()
542             raise e
543     return _wrap
544
545
546 class ReusableIndex(Index):
547     """
548     Works like Index, but does not close/optimize the Lucene index
549     until program exit (uses an atexit hook).
550     This is useful for the importbooks command.
551
552     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
553     """
554     index = None
555
556     def open(self, analyzer=None, threads=4):
557         if ReusableIndex.index is not None:
558             self.index = ReusableIndex.index
559         else:
560             print("opening index")
561             Index.open(self, analyzer)
562             ReusableIndex.index = self.index
563             atexit.register(ReusableIndex.close_reusable)
564
565     # def index_book(self, *args, **kw):
566     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
567     #     ReusableIndex.pool_jobs.append(job)
568
569     @staticmethod
570     def close_reusable():
571         if ReusableIndex.index is not None:
572             ReusableIndex.index.optimize()
573             ReusableIndex.index.close()
574             ReusableIndex.index = None
575
576     def close(self):
577         pass
578
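# Example (editor's sketch, not from the original source): batch indexing with a
# shared writer, as used by the importbooks command. The index stays open between
# books and is optimized/closed once, at process exit (or explicitly):
#
#   idx = ReusableIndex()
#   idx.open()
#   for book in catalogue.models.Book.objects.all():
#       idx.index_book(book)
#   ReusableIndex.close_reusable()   # optional if the atexit hook can be relied upon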
579
580 class JoinSearch(object):
581     """
582     This mixin could be used to handle block join queries.
583     (currently unused)
584     """
585     def __init__(self, *args, **kw):
586         super(JoinSearch, self).__init__(*args, **kw)
587
588     def wrapjoins(self, query, fields=[]):
589         """
590         This function modifies the query recursively,
591         so that contained Term and Phrase queries which match
592         the provided fields are wrapped in a BlockJoinQuery
593         and thus delegated to child documents.
594         """
595         if BooleanQuery.instance_(query):
596             qs = BooleanQuery.cast_(query)
597             for clause in qs:
598                 clause = BooleanClause.cast_(clause)
599                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
600             return qs
601         else:
602             termset = HashSet()
603             query.extractTerms(termset)
604             for t in termset:
605                 t = Term.cast_(t)
606                 if t.field() not in fields:
607                     return query
608             return BlockJoinQuery(query, self.parent_filter,
609                                   BlockJoinQuery.ScoreMode.Total)
610
611     def bsearch(self, query, max_results=50):
612         q = self.query(query)
613         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
614
615         tops = self.searcher.search(bjq, max_results)
616         bks = []
617         for found in tops.scoreDocs:
618             doc = self.searcher.doc(found.doc)
619             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
620         return (bks, tops.totalHits)
621
622
623 class SearchResult(object):
624     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
625         if tokens_cache is None: tokens_cache = {}
626
627         if score:
628             self._score = score
629         else:
630             self._score = scoreDocs.score
631
632         self.boost = 1.0
633
634         self._hits = []
635         self._processed_hits = None  # processed hits
636
637         stored = search.searcher.doc(scoreDocs.doc)
638         self.book_id = int(stored.get("book_id"))
639
640         pd = stored.get("published_date")
641         if pd is None:
642             pd = 0
643         self.published_date = int(pd)
644
645         header_type = stored.get("header_type")
646         # we have a content hit in some header or fragment
647         if header_type is not None:
648             sec = (header_type, int(stored.get("header_index")))
649             header_span = stored.get('header_span')
650             header_span = header_span is not None and int(header_span) or 1
651
652             fragment = stored.get("fragment_anchor")
653
654             if snippets:
655                 snippets = snippets.replace("/\n", "\n")
656             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
657
658             self._hits.append(hit)
659
660         self.search = search
661         self.searched = searched
662         self.tokens_cache = tokens_cache
663
664     @property
665     def score(self):
666         return self._score * self.boost
667
668     def merge(self, other):
669         if self.book_id != other.book_id:
670             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
671         self._hits += other._hits
672         if other.score > self.score:
673             self._score = other._score
674         return self
675
676     def get_book(self):
677         return catalogue.models.Book.objects.get(id=self.book_id)
678
679     book = property(get_book)
680
681     @property
682     def hits(self):
683         if self._processed_hits is not None:
684             return self._processed_hits
685
686         POSITION = 0
687         FRAGMENT = 1
688         POSITION_INDEX = 1
689         POSITION_SPAN = 2
690         SCORE = 2
691         OTHER = 3
692
693         # to sections and fragments
694         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
695         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
696         sect = filter(lambda s: 0 == len(filter(
697             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
698             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
699             frags)), sect)
700
701         hits = []
702
703         # remove duplicate fragments
704         fragments = {}
705         for f in frags:
706             fid = f[FRAGMENT]
707             if fid in fragments:
708                 if fragments[fid][SCORE] >= f[SCORE]:
709                     continue
710             fragments[fid] = f
711         frags = fragments.values()
712
713         # remove duplicate sections
714         sections = {}
715
716         for s in sect:
717             si = s[POSITION][POSITION_INDEX]
718             # skip existing
719             if si in sections:
720                 if sections[si]['score'] >= s[SCORE]:
721                     continue
722
723             m = {'score': s[SCORE],
724                  'section_number': s[POSITION][POSITION_INDEX] + 1,
725                  }
726             m.update(s[OTHER])
727             sections[si] = m
728
729         hits = sections.values()
730
731         for f in frags:
732             try:
733                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
734             except catalogue.models.Fragment.DoesNotExist:
735                 # stale index
736                 continue
737
738             # Figure out if we were searching for a token matching some word in theme name.
739             themes = frag.tags.filter(category='theme')
740             themes_hit = []
741             if self.searched is not None:
742                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
743                 for theme in themes:
744                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
745                     for t in tokens:
746                         if t in name_tokens:
747                             if not theme in themes_hit:
748                                 themes_hit.append(theme)
749                             break
750
751             m = {'score': f[SCORE],
752                  'fragment': frag,
753                  'section_number': f[POSITION][POSITION_INDEX] + 1,
754                  'themes': themes,
755                  'themes_hit': themes_hit
756                  }
757             m.update(f[OTHER])
758             hits.append(m)
759
760         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
761
762         self._processed_hits = hits
763
764         return hits
765
766     def __unicode__(self):
767         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
768
769     @staticmethod
770     def aggregate(*result_lists):
771         books = {}
772         for rl in result_lists:
773             for r in rl:
774                 if r.book_id in books:
775                     books[r.book_id].merge(r)
776                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
777                 else:
778                     books[r.book_id] = r
779         return books.values()
780
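    # Example (editor's sketch; phrase_hits and everywhere_hits are hypothetical lists
    # of SearchResult objects): merging hits for the same book coming from different
    # search strategies, then ordering by score (earlier publication date wins ties,
    # see __cmp__ below):
    #
    #   results = SearchResult.aggregate(phrase_hits, everywhere_hits)
    #   results.sort(reverse=True)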
781     def __cmp__(self, other):
782         c = cmp(self.score, other.score)
783         if c == 0:
784             # this is inverted, because earlier date is better
785             return cmp(other.published_date, self.published_date)
786         else:
787             return c
788
789
790 class Hint(object):
791     """
792     Given some hint information (things we already know about
793     our search target - like author, title of a specific book, epoch, genre, kind)
794     we can narrow down the search using filters.
795     """
796     def __init__(self, search):
797         """
798         Accepts a Searcher instance.
799         """
800         self.search = search
801         self.book_tags = {}
802         self.part_tags = []
803         self._books = []
804
805     def books(self, *books):
806         """
807         Give a hint that we search these books.
808         """
809         self._books = books
810
811     def tags(self, tags):
812         """
813         Give a hint that these Tag objects (a list of them)
814         are necessary.
815         """
816         for t in tags:
817             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
818                 lst = self.book_tags.get(t.category, [])
819                 lst.append(t)
820                 self.book_tags[t.category] = lst
821             if t.category in ['theme', 'theme_pl']:
822                 self.part_tags.append(t)
823
824     def tag_filter(self, tags, field='tags'):
825         """
826         Given a list of tags and an optional field (they are normally in the 'tags' field),
827         returns a filter accepting only books with the specified tags.
828         """
829         q = BooleanQuery()
830
831         for tag in tags:
832             toks = self.search.get_tokens(tag.name, field=field)
833             tag_phrase = PhraseQuery()
834             for tok in toks:
835                 tag_phrase.add(Term(field, tok))
836             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
837
838         return QueryWrapperFilter(q)
839
840     def book_filter(self):
841         """
842         Filters using book tags (all tag kinds except themes).
843         """
844         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
845         if tags:
846             return self.tag_filter(tags)
847         else:
848             return None
849
850     def part_filter(self):
851         """
852         This filter can be used to look for book parts.
853         It filters on book id and/or themes.
854         """
855         fs = []
856         if self.part_tags:
857             fs.append(self.tag_filter(self.part_tags, field='themes'))
858
859         if self._books != []:
860             bf = BooleanFilter()
861             for b in self._books:
862                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
863                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
864             fs.append(bf)
865
866         return Search.chain_filters(fs)
867
868     def should_search_for_book(self):
869         return self._books == []
870
871     def just_search_in(self, all):
872         """Holds logic to figure out which indexes should be searched, when we have some hints already."""
873         some = []
874         for field in all:
875             if field == 'authors' and 'author' in self.book_tags:
876                 continue
877             if field == 'title' and self._books != []:
878                 continue
879             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
880                 continue
881             some.append(field)
882         return some
883
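# Example (editor's sketch, not from the original source): narrowing a search with a
# Hint built from tags the user has already selected (selected_tags is a hypothetical
# list of Tag objects):
#
#   search = Search()
#   hint = search.hint()
#   hint.tags(selected_tags)
#   results = search.search_perfect_book(u"dziady", hint=hint)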
884
885 class Search(IndexStore):
886     """
887     Search facilities.
888     """
889     def __init__(self, default_field="content"):
890         IndexStore.__init__(self)
891         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
892         # self.analyzer = WLAnalyzer()
893         self.searcher = IndexSearcher(self.store, True)
894         self.parser = QueryParser(Version.LUCENE_34, default_field,
895                                   self.analyzer)
896
897         self.parent_filter = TermsFilter()
898         self.parent_filter.addTerm(Term("is_book", "true"))
899
900     def query(self, query):
901         """Parse query in default Lucene Syntax. (for humans)
902         """
903         return self.parser.parse(query)
904
905     def simple_search(self, query, max_results=50):
906         """Runs a query for books using lucene syntax. (for humans)
907         Returns (books, total_hits)
908         """
909
910         tops = self.searcher.search(self.query(query), max_results)
911         bks = []
912         for found in tops.scoreDocs:
913             doc = self.searcher.doc(found.doc)
914             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
915         return (bks, tops.totalHits)
916
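    # Example (editor's sketch): a quick query in lucene syntax, assuming the index has
    # already been built:
    #
    #   search = Search()
    #   books, total = search.simple_search(u'authors: mickiewicz', max_results=10)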
917     def get_tokens(self, searched, field='content', cached=None):
918         """Returns tokens analyzed by the analyzer appropriate for the given field.
919         The argument can be a StringReader, a string/unicode, or a list of tokens. In the last case
920         the tokens are just returned (so we can reuse them, if we don't change the analyzer).
921         """
922         if cached is not None and field in cached:
923             return cached[field]
924
925         if isinstance(searched, str) or isinstance(searched, unicode):
926             searched = StringReader(searched)
927         elif isinstance(searched, list):
928             return searched
929
930         searched.reset()
931         tokens = self.analyzer.reusableTokenStream(field, searched)
932         toks = []
933         while tokens.incrementToken():
934             cta = tokens.getAttribute(CharTermAttribute.class_)
935             toks.append(cta.toString())
936
937         if cached is not None:
938             cached[field] = toks
939
940         return toks
941
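    # Example (editor's sketch; `search` is a hypothetical Search instance): reusing a
    # tokens cache for repeated analyses of the same search string on the same field,
    # so the string is only run through the analyzer once:
    #
    #   cache = {}
    #   toks = search.get_tokens(u"pan tadeusz", field='SIMPLE', cached=cache)
    #   toks_again = search.get_tokens(u"pan tadeusz", field='SIMPLE', cached=cache)  # served from cache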
942     def fuzziness(self, fuzzy):
943         """Helper method to sanitize fuzziness"""
944         if not fuzzy:
945             return None
946         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
947             return fuzzy
948         else:
949             return 0.5
950
951     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
952         """
953         Return a PhraseQuery with a series of tokens.
954         """
955         if fuzzy:
956             phrase = MultiPhraseQuery()
957             for t in tokens:
958                 term = Term(field, t)
959                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
960                 fuzzterms = []
961
962                 while True:
963                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
964                     ft = fuzzterm.term()
965                     if ft:
966                         fuzzterms.append(ft)
967                     if not fuzzterm.next(): break
968                 if fuzzterms:
969                     phrase.add(JArray('object')(fuzzterms, Term))
970                 else:
971                     phrase.add(term)
972         else:
973             phrase = PhraseQuery()
974             phrase.setSlop(slop)
975             for t in tokens:
976                 term = Term(field, t)
977                 phrase.add(term)
978         return phrase
979
980     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
981         """
982         Returns term queries joined by a boolean query.
983         modal - applies to the boolean query
984         fuzzy - should the query be fuzzy.
985         """
986         q = BooleanQuery()
987         for t in tokens:
988             term = Term(field, t)
989             if fuzzy:
990                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
991             else:
992                 term = TermQuery(term)
993             q.add(BooleanClause(term, modal))
994         return q
995
996     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
997                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
998         if filters is None: filters = []
999         if tokens_cache is None: tokens_cache = {}
1000
1001         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1002
1003         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1004         if book:
1005             filters.append(self.term_filter(Term('is_book', 'true')))
1006         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1007
1008         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1009
1010     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1011                     filters=None, tokens_cache=None, boost=None, snippets=True):
1012         if filters is None: filters = []
1013         if tokens_cache is None: tokens_cache = {}
1014
1015         if book:
1016             filters.append(self.term_filter(Term('is_book', 'true')))
1017
1018         query = BooleanQuery()
1019
1020         for fld in fields:
1021             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1022
1023             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1024                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1025
1026         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1027
1028         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1029                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1030
1031     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1032         """
1033         Search for perfect book matches. Just see if the query matches some author or title,
1034         taking hints into account.
1035         """
1036         fields_to_search = ['authors', 'title']
1037         only_in = None
1038         if hint:
1039             if not hint.should_search_for_book():
1040                 return []
1041             fields_to_search = hint.just_search_in(fields_to_search)
1042             only_in = hint.book_filter()
1043
1044         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1045
1046         books = []
1047         for q in qrys:
1048             top = self.searcher.search(q,
1049                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1050                 max_results)
1051             for found in top.scoreDocs:
1052                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1053         return books
1054
1055     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1056         fields_to_search = ['tags', 'authors', 'title']
1057
1058         only_in = None
1059         if hint:
1060             if not hint.should_search_for_book():
1061                 return []
1062             fields_to_search = hint.just_search_in(fields_to_search)
1063             only_in = hint.book_filter()
1064
1065         tokens = self.get_tokens(searched, field='SIMPLE')
1066
1067         q = BooleanQuery()
1068
1069         for fld in fields_to_search:
1070             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1071                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1072
1073         books = []
1074         top = self.searcher.search(q,
1075                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1076             max_results)
1077         for found in top.scoreDocs:
1078             books.append(SearchResult(self, found, how_found="search_book"))
1079
1080         return books
1081
1082     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1083         """
1084         Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
1085         some part/fragment of the book.
1086         """
1087         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1088
1089         flt = None
1090         if hint:
1091             flt = hint.part_filter()
1092
1093         books = []
1094         for q in qrys:
1095             top = self.searcher.search(q,
1096                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1097                                                            flt]),
1098                                        max_results)
1099             for found in top.scoreDocs:
1100                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1101
1102         return books
1103
1104     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1105         """
1106         Tries to use search terms to match different fields of a book (or its parts).
1107         E.g. one word can be an author's surname, another a part of the title, and the rest
1108         some words from the third chapter.
1109         """
1110         if tokens_cache is None: tokens_cache = {}
1111         books = []
1112         only_in = None
1113
1114         if hint:
1115             only_in = hint.part_filter()
1116
1117         # content only query : themes x content
1118         q = BooleanQuery()
1119
1120         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1121         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1122
1123         # only search in themes when we do not already filter by themes
1124         if hint is None or hint.just_search_in(['themes']) != []:
1125             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1126                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1127
1128         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1129                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1130
1131         topDocs = self.searcher.search(q, only_in, max_results)
1132         for found in topDocs.scoreDocs:
1133             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1134             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1135
1136         # query themes/content x author/title/tags
1137         q = BooleanQuery()
1138         in_content = BooleanQuery()
1139         in_meta = BooleanQuery()
1140
1141         for fld in ['themes_pl', 'content']:
1142             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1143
1144         for fld in ['tags', 'authors', 'title']:
1145             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1146
1147         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1148         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1149
1150         topDocs = self.searcher.search(q, only_in, max_results)
1151         for found in topDocs.scoreDocs:
1152             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1153             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1154
1155         return books
1156
1157     # def multisearch(self, query, max_results=50):
1158     #     """
1159     #     Search strategy:
1160     #     - (phrase) OR -> content
1161     #                   -> title
1162     #                   -> authors
1163     #     - (keywords)  -> authors
1164     #                   -> motyw
1165     #                   -> tags
1166     #                   -> content
1167     #     """
1168         # queryreader = StringReader(query)
1169         # tokens = self.get_tokens(queryreader)
1170
1171         # top_level = BooleanQuery()
1172         # Should = BooleanClause.Occur.SHOULD
1173
1174         # phrase_level = BooleanQuery()
1175         # phrase_level.setBoost(1.3)
1176
1177         # p_content = self.make_phrase(tokens, joined=True)
1178         # p_title = self.make_phrase(tokens, 'title')
1179         # p_author = self.make_phrase(tokens, 'author')
1180
1181         # phrase_level.add(BooleanClause(p_content, Should))
1182         # phrase_level.add(BooleanClause(p_title, Should))
1183         # phrase_level.add(BooleanClause(p_author, Should))
1184
1185         # kw_level = BooleanQuery()
1186
1187         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1188         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1189         # kw_level.add(j_themes, Should)
1190         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1191         # j_con = self.make_term_query(tokens, joined=True)
1192         # kw_level.add(j_con, Should)
1193
1194         # top_level.add(BooleanClause(phrase_level, Should))
1195         # top_level.add(BooleanClause(kw_level, Should))
1196
1197         # return None
1198
1199     def get_snippets(self, scoreDoc, query, field='content'):
1200         """
1201         Returns a snippet for found scoreDoc.
1202         """
1203         htmlFormatter = SimpleHTMLFormatter()
1204         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1205
1206         stored = self.searcher.doc(scoreDoc.doc)
1207
1208         position = stored.get('snippets_position')
1209         length = stored.get('snippets_length')
1210         if position is None or length is None:
1211             return None
1212         # locate content.
1213         snippets = Snippets(stored.get('book_id')).open()
1214         try:
1215             text = snippets.get((int(position),
1216                                  int(length)))
1217         finally:
1218             snippets.close()
1219
1220         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1221         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1222         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1223
1224         return snip
1225
1226     @staticmethod
1227     def enum_to_array(enum):
1228         """
1229         Converts a lucene TermEnum to an array of Terms, suitable for
1230         addition to queries
1231         """
1232         terms = []
1233
1234         while True:
1235             t = enum.term()
1236             if t:
1237                 terms.append(t)
1238             if not enum.next(): break
1239
1240         if terms:
1241             return JArray('object')(terms, Term)
1242
1243     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1244         """
1245         Search for Tag objects using query.
1246         """
1247         if not pdcounter:
1248             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1249         tops = self.searcher.search(query, filters, max_results)
1250
1251         tags = []
1252         for found in tops.scoreDocs:
1253             doc = self.searcher.doc(found.doc)
1254             is_pdcounter = doc.get('is_pdcounter')
1255             if is_pdcounter:
1256                 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1257             else:
1258                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1259                 # don't add the pdcounter tag if same tag already exists
1260             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1261                 tags.append(tag)
1262                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1263         print 'returning %s' % tags
1264         return tags
1265
1266     def search_books(self, query, filter=None, max_results=10):
1267         """
1268         Searches for Book objects using query
1269         """
1270         bks = []
1271         tops = self.searcher.search(query, filter, max_results)
1272         for found in tops.scoreDocs:
1273             doc = self.searcher.doc(found.doc)
1274             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1275         return bks
1276
1277     def make_prefix_phrase(self, toks, field):
1278         q = MultiPhraseQuery()
1279         for i in range(len(toks)):
1280             t = Term(field, toks[i])
1281             if i == len(toks) - 1:
1282                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1283                 if pterms:
1284                     q.add(pterms)
1285                 else:
1286                     q.add(t)
1287             else:
1288                 q.add(t)
1289         return q
1290
1291     @staticmethod
1292     def term_filter(term, inverse=False):
1293         only_term = TermsFilter()
1294         only_term.addTerm(term)
1295
1296         if inverse:
1297             neg = BooleanFilter()
1298             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1299             only_term = neg
1300
1301         return only_term
1302
1303     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
1304         """
1305         Return auto-complete hints for tags
1306         using prefix search.
1307         """
1308         toks = self.get_tokens(string, field='SIMPLE')
1309         top = BooleanQuery()
1310
1311         for field in ['tag_name', 'tag_name_pl']:
1312             if prefix:
1313                 q = self.make_prefix_phrase(toks, field)
1314             else:
1315                 q = self.make_term_query(toks, field)
1316             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1317
1318         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1319
1320         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1321
1322     def hint_books(self, string, max_results=50, prefix=True):
1323         """
1324         Returns auto-complete hints for book titles
1325         (because we do not index 'pseudo' title tags).
1326         Uses prefix search.
1327         """
1328         toks = self.get_tokens(string, field='SIMPLE')
1329
1330         if prefix:
1331             q = self.make_prefix_phrase(toks, 'title')
1332         else:
1333             q = self.make_term_query(toks, 'title')
1334
1335         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1336
1337     @staticmethod
1338     def chain_filters(filters, op=ChainedFilter.AND):
1339         """
1340         Chains a filter list together
1341         """
1342         filters = filter(lambda x: x is not None, filters)
1343         if not filters:
1344             return None
1345         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1346         return chf
1347
1348     def filtered_categories(self, tags):
1349         """
1350         Return a list of tag categories, present in tags list.
1351         """
1352         cats = {}
1353         for t in tags:
1354             cats[t.category] = True
1355         return cats.keys()
1356
1357     def hint(self):
1358         return Hint(self)