[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
33 import atexit
34 import traceback
35
36
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
38     def __init__(self):
39         polish = PolishAnalyzer(Version.LUCENE_34)
40         #        polish_gap.setPositionIncrementGap(999)
41
42         simple = SimpleAnalyzer(Version.LUCENE_34)
43         #        simple_gap.setPositionIncrementGap(999)
44
45         keyword = KeywordAnalyzer(Version.LUCENE_34)
46
47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
48
49         PerFieldAnalyzerWrapper.__init__(self, polish)
50
51         self.addAnalyzer("tags", simple)
52         self.addAnalyzer("technical_editors", simple)
53         self.addAnalyzer("editors", simple)
54         self.addAnalyzer("url", keyword)
55         self.addAnalyzer("source_url", keyword)
56         self.addAnalyzer("source_name", simple)
57         self.addAnalyzer("publisher", simple)
58         self.addAnalyzer("authors", simple)
59         self.addAnalyzer("title", simple)
60
61         self.addAnalyzer("is_book", keyword)
62         # shouldn't the title have two forms? _pl and simple?
63
64         self.addAnalyzer("themes", simple)
65         self.addAnalyzer("themes_pl", polish)
66
67         self.addAnalyzer("tag_name", simple)
68         self.addAnalyzer("tag_name_pl", polish)
69
70         self.addAnalyzer("translators", simple)
71
72         self.addAnalyzer("KEYWORD", keyword)
73         self.addAnalyzer("SIMPLE", simple)
74         self.addAnalyzer("POLISH", polish)
75
76
77 class IndexStore(object):
78     """
79     Provides access to the search index.
80
81     self.store - the Lucene index directory
82     """
83     def __init__(self):
84         self.make_index_dir()
85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
86
87     def make_index_dir(self):
88         try:
89             os.makedirs(settings.SEARCH_INDEX)
90         except OSError as exc:
91             if exc.errno == errno.EEXIST:
92                 pass
93             else: raise
94
95
96 class IndexChecker(IndexStore):
97     def __init__(self):
98         IndexStore.__init__(self)
99
100     def check(self):
101         checker = CheckIndex(self.store)
102         status = checker.checkIndex()
103         return status
104
105
106 class Snippets(object):
107     """
108     This class manages snippet files for an indexed object (book).
109     The snippets are concatenated together, and their positions and
110     lengths are kept in Lucene index fields.
111     """
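
    # Usage sketch (hedged; assumes a book with id 1 and a configured
    # settings.SEARCH_INDEX directory):
    #
    #   snippets = Snippets(1).open('w')
    #   try:
    #       pos = snippets.add(u"Fragment text")   # returns (position, length)
    #   finally:
    #       snippets.close()
    #   reader = Snippets(1).open()
    #   print reader.get(pos)
    #   reader.close()
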
112     SNIPPET_DIR = "snippets"
113
114     def __init__(self, book_id):
115         try:
116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117         except OSError as exc:
118             if exc.errno == errno.EEXIST:
119                 pass
120             else: raise
121         self.book_id = book_id
122         self.file = None
123
124     def open(self, mode='r'):
125         """
126         Open the snippet file. Call .close() afterwards.
127         """
128         if 'b' not in mode:
129             mode += 'b'
130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
131         self.position = 0
132         return self
133
134     def add(self, snippet):
135         """
136         Append a snippet (unicode) to the snippet file.
137         Return a (position, length) tuple
138         """
139         txt = snippet.encode('utf-8')
140         l = len(txt)
141         self.file.write(txt)
142         pos = (self.position, l)
143         self.position += l
144         return pos
145
146     def get(self, pos):
147         """
148         Given a (position, length) tuple, return a unicode string
149         of the snippet stored there.
150         """
151         self.file.seek(pos[0], 0)
152         txt = self.file.read(pos[1]).decode('utf-8')
153         return txt
154
155     def close(self):
156         """Close snippet file"""
157         self.file.close()
158
159
160 class BaseIndex(IndexStore):
161     """
162     Base index class.
163     Provides basic operations on the index: opening, closing, optimizing.
164     """
165     def __init__(self, analyzer=None):
166         super(BaseIndex, self).__init__()
167         self.index = None
168         if not analyzer:
169             analyzer = WLAnalyzer()
170         self.analyzer = analyzer
171
172     def open(self, analyzer=None):
173         if self.index:
174             raise Exception("Index is already opened")
175         self.index = IndexWriter(self.store, self.analyzer,\
176                                  IndexWriter.MaxFieldLength.LIMITED)
177         return self.index
178
179     def optimize(self):
180         self.index.optimize()
181
182     def close(self):
183         try:
184             self.index.optimize()
185         except JavaError, je:
186             print "Error during optimize phase, check index: %s" % je
187
188         self.index.close()
189         self.index = None
190
191     def __enter__(self):
192         self.open()
193         return self
194
195     def __exit__(self, type, value, tb):
196         self.close()
197
198
199 class Index(BaseIndex):
200     """
201     Class indexing books.
202     """
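
    # Usage sketch (hedged): Index inherits context-manager behaviour from
    # BaseIndex, so a typical (re)indexing run could look like:
    #
    #   with Index() as idx:
    #       idx.index_tags()
    #       idx.index_book(book)   # 'book' being a catalogue.models.Book instance
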
203     def __init__(self, analyzer=None):
204         super(Index, self).__init__(analyzer)
205
206     def index_tags(self):
207         """
208         Re-index global tag list.
209         Removes all tags from the index, then indexes them again.
210         Indexed fields include: id, name (with and without Polish stems), category.
211         """
212         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
213         self.index.deleteDocuments(q)
214
215         for tag in catalogue.models.Tag.objects.all():
216             doc = Document()
217             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
218             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
220             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
221             self.index.addDocument(doc)
222
223         for pdtag in PDCounterAuthor.objects.all():
224             doc = Document()
225             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
226             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
227             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
228             doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
229             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
230             self.index.addDocument(doc)
231
232     def create_book_doc(self, book):
233         """
234         Create a Lucene document referring to the book id.
235         """
236         doc = Document()
237         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
238         if book.parent is not None:
239             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
240         return doc
241
242     def remove_book(self, book):
243         """Removes a book from search index.
244         book - Book instance."""
245         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
246         self.index.deleteDocuments(q)
247
248     def index_book(self, book, book_info=None, overwrite=True):
249         """
250         Indexes the book.
251         Creates a lucene document for extracted metadata
252         and calls self.index_content() to index the contents of the book.
253         """
254         if overwrite:
255             self.remove_book(book)
256
257         book_doc = self.create_book_doc(book)
258         meta_fields = self.extract_metadata(book, book_info)
259         for f in meta_fields.values():
260             if isinstance(f, list) or isinstance(f, tuple):
261                 for elem in f:
262                     book_doc.add(elem)
263             else:
264                 book_doc.add(f)
265
266         self.index.addDocument(book_doc)
267         del book_doc
268
269         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
270
271     master_tags = [
272         'opowiadanie',
273         'powiesc',
274         'dramat_wierszowany_l',
275         'dramat_wierszowany_lp',
276         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
277         'wywiad',
278         ]
279
280     ignore_content_tags = [
281         'uwaga', 'extra',
282         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
283         'didaskalia',
284         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
285         ]
286
287     footnote_tags = ['pa', 'pt', 'pr', 'pe']
288
289     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
290
291     published_date_re = re.compile("([0-9]+)[\]. ]*$")
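    # A sketch of the intent: the regex pulls a trailing year out of source_name,
    # e.g. a value ending in "... 1884]." or "... 1884" yields "1884"
    # (actual source_name formats may vary).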
292
293     def extract_metadata(self, book, book_info=None):
294         """
295         Extracts metadata from the book and returns a map of fields keyed by field name.
296         """
297         fields = {}
298
299         if book_info is None:
300             book_info = dcparser.parse(open(book.xml_file.path))
301
302         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
303         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
304         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
305
306         # validator, name
307         for field in dcparser.BookInfo.FIELDS:
308             if hasattr(book_info, field.name):
309                 if not getattr(book_info, field.name):
310                     continue
311                 # since no type information is available, we use validator
312                 type_indicator = field.validator
313                 if type_indicator == dcparser.as_unicode:
314                     s = getattr(book_info, field.name)
315                     if field.multiple:
316                         s = ', '.join(s)
317                     try:
318                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
319                     except JavaError as je:
320                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
321                 elif type_indicator == dcparser.as_person:
322                     p = getattr(book_info, field.name)
323                     if isinstance(p, dcparser.Person):
324                         persons = unicode(p)
325                     else:
326                         persons = ', '.join(map(unicode, p))
327                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
328                 elif type_indicator == dcparser.as_date:
329                     dt = getattr(book_info, field.name)
330                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
331                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
332
333         # get published date
334         source = book_info.source_name
335         match = self.published_date_re.search(source)
336         if match is not None:
337             fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
338
339         return fields
340
341     def add_gaps(self, fields, fieldname):
342         """
343         Interleaves a list of fields with gap fields (indexed single spaces) and returns the result.
344         This allows phrase queries which do not cross the gaps (when slop is 0).
345         """
346         def gap():
347             while True:
348                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
349         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
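
    # Example (a sketch): add_gaps([f1, f2], 'tags') yields f1, gap, f2 (the
    # trailing gap is dropped), where gap is an indexed single-space
    # Field('tags', ' ', ...), so a phrase query with slop=0 cannot match a
    # phrase spanning f1 and f2.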
350
351     def get_master(self, root):
352         """
353         Returns the first master tag from an etree.
354         """
355         for master in root.iter():
356             if master.tag in self.master_tags:
357                 return master
358
359     def index_content(self, book, book_fields=[]):
360         """
361         Walks the book XML and extracts content from it.
362         Adds parts for each header tag and for each fragment.
363         """
364         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
365         root = wld.edoc.getroot()
366
367         master = self.get_master(root)
368         if master is None:
369             return []
370
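        # A small depth-first iterator: walker yields (node, None) when entering
        # a node and (None, node) when leaving it, skipping subtrees whose tag
        # is in ignore_tags; e.g. for <a><b/></a> it yields
        # (a, None), (b, None), (None, b), (None, a).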
371         def walker(node, ignore_tags=[]):
372             yield node, None
373             for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
374                 for b, e in walker(child, ignore_tags):
375                     yield b, e
376             yield None, node
377             return
378
379         def fix_format(text):
380             #            separator = [u" ", u"\t", u".", u";", u","]
381             if isinstance(text, list):
382                 # need to join it first
383                 text = filter(lambda s: s is not None, text)
384                 text = u' '.join(text)
385                 # for i in range(len(text)):
386                 #     if i > 0:
387                 #         if text[i][0] not in separator\
388                 #             and text[i - 1][-1] not in separator:
389                 #          text.insert(i, u" ")
390
391             return re.sub("(?m)/$", "", text)
392
393         def add_part(snippets, **fields):
394             doc = self.create_book_doc(book)
395             for f in book_fields:
396                 doc.add(f)
397
398             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
399             doc.add(NumericField("header_span", Field.Store.YES, True)\
400                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
401             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
402
403             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
404                           Field.TermVector.WITH_POSITIONS_OFFSETS))
405
406             snip_pos = snippets.add(fields["content"])
407             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
408             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
409
410             if 'fragment_anchor' in fields:
411                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
412                               Field.Store.YES, Field.Index.NOT_ANALYZED))
413
414             if 'themes' in fields:
415                 themes, themes_pl = zip(*[
416                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
417                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
418                      for theme in fields['themes']])
419
420                 themes = self.add_gaps(themes, 'themes')
421                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
422
423                 for t in themes:
424                     doc.add(t)
425                 for t in themes_pl:
426                     doc.add(t)
427
428             return doc
429
430         def give_me_utf8(s):
431             if isinstance(s, unicode):
432                 return s.encode('utf-8')
433             else:
434                 return s
435
436         fragments = {}
437         snippets = Snippets(book.id).open('w')
438         try:
439             for header, position in zip(list(master), range(len(master))):
440
441                 if header.tag in self.skip_header_tags:
442                     continue
443                 if header.tag is etree.Comment:
444                     continue
445
446                 # section content
447                 content = []
448                 footnote = None
449
450                 for start, end in walker(header, ignore_tags=self.ignore_content_tags):
451                     # handle footnotes
452                     # if start is not None and start.tag in self.footnote_tags:
453                     #     footnote = ' '.join(start.itertext())
454                     # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
455                     #     doc = add_part(snippets, header_index=position, header_type=header.tag,
456                     #                    content=footnote)
457
458                     #     self.index.addDocument(doc)
459
460                     #     footnote = None
461
462                     # handle fragments and themes.
463                     if start is not None and start.tag == 'begin':
464                         fid = start.attrib['id'][1:]
465                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
466
467                     elif start is not None and start.tag == 'motyw':
468                         fid = start.attrib['id'][1:]
469                         if start.text is not None:
470                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
471
472                     elif start is not None and start.tag == 'end':
473                         fid = start.attrib['id'][1:]
474                         if fid not in fragments:
475                             continue  # a broken <end> node, skip it
477                         frag = fragments[fid]
478                         if frag['themes'] == []:
479                             continue  # empty themes list.
480                         del fragments[fid]
481
482                         doc = add_part(snippets,
483                                        header_type=frag['start_header'],
484                                        header_index=frag['start_section'],
485                                        header_span=position - frag['start_section'] + 1,
486                                        fragment_anchor=fid,
487                                        content=fix_format(frag['content']),
488                                        themes=frag['themes'])
489
490                         self.index.addDocument(doc)
491
492                         # Collect content.
493                     elif start is not None:
494                         for frag in fragments.values():
495                             frag['content'].append(start.text)
496                         content.append(start.text)
497                     elif end is not None:
498                         for frag in fragments.values():
499                             frag['content'].append(end.tail)
500                         content.append(end.tail)
501
502                         # in the end, add a section text.
503                 doc = add_part(snippets, header_index=position, header_type=header.tag,
504                                content=fix_format(content))
505
506                 self.index.addDocument(doc)
507
508         finally:
509             snippets.close()
510
511
512 def log_exception_wrapper(f):
513     def _wrap(*a):
514         try:
515             f(*a)
516         except Exception, e:
517             print("Error in indexing thread: %s" % e)
518             traceback.print_exc()
519             raise e
520     return _wrap
521
522
523 class ReusableIndex(Index):
524     """
525     Works like index, but does not close/optimize Lucene index
526     until program exit (uses atexit hook).
527     This is useful for the importbooks command.
528
529     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
530     """
531     index = None
532
533     def open(self, analyzer=None, threads=4):
534         if ReusableIndex.index is not None:
535             self.index = ReusableIndex.index
536         else:
537             print("opening index")
538             Index.open(self, analyzer)
539             ReusableIndex.index = self.index
540             atexit.register(ReusableIndex.close_reusable)
541
542     # def index_book(self, *args, **kw):
543     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
544     #     ReusableIndex.pool_jobs.append(job)
545
546     @staticmethod
547     def close_reusable():
548         if ReusableIndex.index is not None:
549             ReusableIndex.index.optimize()
550             ReusableIndex.index.close()
551             ReusableIndex.index = None
552
553     def close(self):
554         pass
555
556
557 class JoinSearch(object):
558     """
559     This mixin could be used to handle block join queries.
560     (currently unused)
561     """
562     def __init__(self, *args, **kw):
563         super(JoinSearch, self).__init__(*args, **kw)
564
565     def wrapjoins(self, query, fields=[]):
566         """
567         This function modifies the query recursively, so that
568         contained Term and Phrase queries which match the
569         provided fields are wrapped in a BlockJoinQuery
570         and thus delegated to child documents.
571         """
572         if BooleanQuery.instance_(query):
573             qs = BooleanQuery.cast_(query)
574             for clause in qs:
575                 clause = BooleanClause.cast_(clause)
576                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
577             return qs
578         else:
579             termset = HashSet()
580             query.extractTerms(termset)
581             for t in termset:
582                 t = Term.cast_(t)
583                 if t.field() not in fields:
584                     return query
585             return BlockJoinQuery(query, self.parent_filter,
586                                   BlockJoinQuery.ScoreMode.Total)
587
588     def bsearch(self, query, max_results=50):
589         q = self.query(query)
590         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
591
592         tops = self.searcher.search(bjq, max_results)
593         bks = []
594         for found in tops.scoreDocs:
595             doc = self.searcher.doc(found.doc)
596             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
597         return (bks, tops.totalHits)
598
599
600 class SearchResult(object):
601     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
602         if tokens_cache is None: tokens_cache = {}
603
604         if score:
605             self._score = score
606         else:
607             self._score = scoreDocs.score
608
609         self.boost = 1.0
610
611         self._hits = []
612         self._processed_hits = None  # processed hits
613
614         stored = search.searcher.doc(scoreDocs.doc)
615         self.book_id = int(stored.get("book_id"))
616
617         pd = stored.get("published_date")
618         if pd is None:
619             pd = 0
620         self.published_date = int(pd)
621
622         header_type = stored.get("header_type")
623         # we have a content hit in some header or fragment
624         if header_type is not None:
625             sec = (header_type, int(stored.get("header_index")))
626             header_span = stored.get('header_span')
627             header_span = header_span is not None and int(header_span) or 1
628
629             fragment = stored.get("fragment_anchor")
630
631             if snippets:
632                 snippets = snippets.replace("/\n", "\n")
633             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
634
635             self._hits.append(hit)
636
637         self.search = search
638         self.searched = searched
639         self.tokens_cache = tokens_cache
640
641     @property
642     def score(self):
643         return self._score * self.boost
644
645     def merge(self, other):
646         if self.book_id != other.book_id:
647             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
648         self._hits += other._hits
649         if other.score > self.score:
650             self._score = other._score
651         return self
652
653     def get_book(self):
654         return catalogue.models.Book.objects.get(id=self.book_id)
655
656     book = property(get_book)
657
658     @property
659     def hits(self):
660         if self._processed_hits is not None:
661             return self._processed_hits
662
663         POSITION = 0
664         FRAGMENT = 1
665         POSITION_INDEX = 1
666         POSITION_SPAN = 2
667         SCORE = 2
668         OTHER = 3
669
670         # to sections and fragments
671         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
672         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
673         sect = filter(lambda s: 0 == len(filter(
674             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
675             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
676             frags)), sect)
677
678         hits = []
679
680         # remove duplicate fragments
681         fragments = {}
682         for f in frags:
683             fid = f[FRAGMENT]
684             if fid in fragments:
685                 if fragments[fid][SCORE] >= f[SCORE]:
686                     continue
687             fragments[fid] = f
688         frags = fragments.values()
689
690         # remove duplicate sections
691         sections = {}
692
693         for s in sect:
694             si = s[POSITION][POSITION_INDEX]
695             # skip existing
696             if si in sections:
697                 if sections[si]['score'] >= s[SCORE]:
698                     continue
699
700             m = {'score': s[SCORE],
701                  'section_number': s[POSITION][POSITION_INDEX] + 1,
702                  }
703             m.update(s[OTHER])
704             sections[si] = m
705
706         hits = sections.values()
707
708         for f in frags:
709             try:
710                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
711             except catalogue.models.Fragment.DoesNotExist:
712                 # stale index
713                 continue
714
715             # Figure out if we were searching for a token matching some word in theme name.
716             themes = frag.tags.filter(category='theme')
717             themes_hit = []
718             if self.searched is not None:
719                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
720                 for theme in themes:
721                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
722                     for t in tokens:
723                         if t in name_tokens:
724                             if not theme in themes_hit:
725                                 themes_hit.append(theme)
726                             break
727
728             m = {'score': f[SCORE],
729                  'fragment': frag,
730                  'section_number': f[POSITION][POSITION_INDEX] + 1,
731                  'themes': themes,
732                  'themes_hit': themes_hit
733                  }
734             m.update(f[OTHER])
735             hits.append(m)
736
737         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
738
739         self._processed_hits = hits
740
741         return hits
742
743     def __unicode__(self):
744         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
745
746     @staticmethod
747     def aggregate(*result_lists):
748         books = {}
749         for rl in result_lists:
750             for r in rl:
751                 if r.book_id in books:
752                     books[r.book_id].merge(r)
753                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
754                 else:
755                     books[r.book_id] = r
756         return books.values()
757
758     def __cmp__(self, other):
759         c = cmp(self.score, other.score)
760         if c == 0:
761             if not hasattr(other,'published_date') or not hasattr(self, 'published_date'):
762                 import pdb; pdb.set_trace()
763             # this is inverted, because earlier date is better
764             return cmp(other.published_date, self.published_date)
765         else:
766             return c
767
768
769 class Hint(object):
770     """
771     Given some hint information about our search target - things we already
772     know, like the author, title (a specific book), epoch, genre or kind -
773     we can narrow down the search using filters.
774     """
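
    # Usage sketch (with hypothetical tag/book objects):
    #
    #   hint = search.hint()                 # 'search' is a Search instance
    #   hint.tags([author_tag])              # a Tag with category 'author'
    #   books = search.search_perfect_book(u"lalka", hint=hint)
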
775     def __init__(self, search):
776         """
777         Accepts a Searcher instance.
778         """
779         self.search = search
780         self.book_tags = {}
781         self.part_tags = []
782         self._books = []
783
784     def books(self, *books):
785         """
786         Give a hint that we search these books.
787         """
788         self._books = books
789
790     def tags(self, tags):
791         """
792         Give a hint that these Tag objects (given as a list)
793         are required.
794         """
795         for t in tags:
796             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
797                 lst = self.book_tags.get(t.category, [])
798                 lst.append(t)
799                 self.book_tags[t.category] = lst
800             if t.category in ['theme', 'theme_pl']:
801                 self.part_tags.append(t)
802
803     def tag_filter(self, tags, field='tags'):
804         """
805         Given a list of tags and an optional field (they are normally in the 'tags' field),
806         returns a filter accepting only books with the specified tags.
807         """
808         q = BooleanQuery()
809
810         for tag in tags:
811             toks = self.search.get_tokens(tag.name, field=field)
812             tag_phrase = PhraseQuery()
813             for tok in toks:
814                 tag_phrase.add(Term(field, tok))
815             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
816
817         return QueryWrapperFilter(q)
818
819     def book_filter(self):
820         """
821         Filters using book tags (all tag kinds except theme)
822         """
823         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
824         if tags:
825             return self.tag_filter(tags)
826         else:
827             return None
828
829     def part_filter(self):
830         """
831         This filter can be used to look for book parts.
832         It filters on book id and/or themes.
833         """
834         fs = []
835         if self.part_tags:
836             fs.append(self.tag_filter(self.part_tags, field='themes'))
837
838         if self._books != []:
839             bf = BooleanFilter()
840             for b in self._books:
841                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
842                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
843             fs.append(bf)
844
845         return Search.chain_filters(fs)
846
847     def should_search_for_book(self):
848         return self._books == []
849
850     def just_search_in(self, all):
851         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
852         some = []
853         for field in all:
854             if field == 'authors' and 'author' in self.book_tags:
855                 continue
856             if field == 'title' and self._books != []:
857                 continue
858             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
859                 continue
860             some.append(field)
861         return some
862
863
864 class Search(IndexStore):
865     """
866     Search facilities.
867     """
868     def __init__(self, default_field="content"):
869         IndexStore.__init__(self)
870         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
871         # self.analyzer = WLAnalyzer()
872         self.searcher = IndexSearcher(self.store, True)
873         self.parser = QueryParser(Version.LUCENE_34, default_field,
874                                   self.analyzer)
875
876         self.parent_filter = TermsFilter()
877         self.parent_filter.addTerm(Term("is_book", "true"))
878
879     def query(self, query):
880         """Parse query in default Lucene Syntax. (for humans)
881         """
882         return self.parser.parse(query)
883
884     def simple_search(self, query, max_results=50):
885         """Runs a query for books using lucene syntax. (for humans)
886         Returns (books, total_hits)
887         """
888
889         tops = self.searcher.search(self.query(query), max_results)
890         bks = []
891         for found in tops.scoreDocs:
892             doc = self.searcher.doc(found.doc)
893             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
894         return (bks, tops.totalHits)
895
896     def get_tokens(self, searched, field='content', cached=None):
897         """returns tokens analyzed by a proper (for a field) analyzer
898         argument can be: StringReader, string/unicode, or tokens. In the last case
899         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
900         """
901         if cached is not None and field in cached:
902             return cached[field]
903
904         if isinstance(searched, str) or isinstance(searched, unicode):
905             searched = StringReader(searched)
906         elif isinstance(searched, list):
907             return searched
908
909         searched.reset()
910         tokens = self.analyzer.reusableTokenStream(field, searched)
911         toks = []
912         while tokens.incrementToken():
913             cta = tokens.getAttribute(CharTermAttribute.class_)
914             toks.append(cta.toString())
915
916         if cached is not None:
917             cached[field] = toks
918
919         return toks
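
    # Example (hedged - the exact output depends on the analyzer): with the
    # 'SIMPLE' field analyzer, get_tokens(u"Ala ma Kota", 'SIMPLE') returns
    # roughly [u'ala', u'ma', u'kota']; passing a list of tokens returns it
    # unchanged.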
920
921     def fuzziness(self, fuzzy):
922         """Helper method to sanitize fuzziness"""
923         if not fuzzy:
924             return None
925         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
926             return fuzzy
927         else:
928             return 0.5
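
    # Examples: fuzziness(False) -> None, fuzziness(0.7) -> 0.7,
    # fuzziness(True) -> 0.5, fuzziness(2.0) -> 0.5 (fallback similarity).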
929
930     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
931         """
932         Return a PhraseQuery with a series of tokens.
933         """
934         if fuzzy:
935             phrase = MultiPhraseQuery()
936             for t in tokens:
937                 term = Term(field, t)
938                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
939                 fuzzterms = []
940
941                 while True:
942                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
943                     ft = fuzzterm.term()
944                     if ft:
945                         fuzzterms.append(ft)
946                     if not fuzzterm.next(): break
947                 if fuzzterms:
948                     phrase.add(JArray('object')(fuzzterms, Term))
949                 else:
950                     phrase.add(term)
951         else:
952             phrase = PhraseQuery()
953             phrase.setSlop(slop)
954             for t in tokens:
955                 term = Term(field, t)
956                 phrase.add(term)
957         return phrase
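
    # Example (a sketch): make_phrase([u'pan', u'tadeusz'], field='title')
    # builds roughly title:"pan tadeusz"~2; with fuzzy set, each token is
    # expanded through FuzzyTermEnum to similar terms found in the index.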
958
959     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
960         """
961         Returns term queries joined into a boolean query.
962         modal - the occurrence requirement applied to each clause of the boolean query
963         fuzzy - whether the query should be fuzzy.
964         """
965         q = BooleanQuery()
966         for t in tokens:
967             term = Term(field, t)
968             if fuzzy:
969                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
970             else:
971                 term = TermQuery(term)
972             q.add(BooleanClause(term, modal))
973         return q
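
    # Example (a sketch): make_term_query([u'ala', u'kot'], field='content')
    # yields roughly (content:ala OR content:kot); with
    # modal=BooleanClause.Occur.MUST every term becomes required.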
974
975     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
976                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
977         if filters is None: filters = []
978         if tokens_cache is None: tokens_cache = {}
979
980         tokens = self.get_tokens(searched, field, cached=tokens_cache)
981
982         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
983         if book:
984             filters.append(self.term_filter(Term('is_book', 'true')))
985         top = self.searcher.search(query, self.chain_filters(filters), max_results)
986
987         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
988
989     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
990                     filters=None, tokens_cache=None, boost=None, snippets=True):
991         if filters is None: filters = []
992         if tokens_cache is None: tokens_cache = {}
993
994         if book:
995             filters.append(self.term_filter(Term('is_book', 'true')))
996
997         query = BooleanQuery()
998
999         for fld in fields:
1000             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1001
1002             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1003                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1004
1005         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1006
1007         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1008                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1009
1010     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1011         """
1012         Search for perfect book matches. Just see if the query matches with some author or title,
1013         taking hints into account.
1014         """
1015         fields_to_search = ['authors', 'title']
1016         only_in = None
1017         if hint:
1018             if not hint.should_search_for_book():
1019                 return []
1020             fields_to_search = hint.just_search_in(fields_to_search)
1021             only_in = hint.book_filter()
1022
1023         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1024
1025         books = []
1026         for q in qrys:
1027             top = self.searcher.search(q,
1028                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1029                 max_results)
1030             for found in top.scoreDocs:
1031                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1032         return books
1033
1034     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1035         fields_to_search = ['tags', 'authors', 'title']
1036
1037         only_in = None
1038         if hint:
1039             if not hint.should_search_for_book():
1040                 return []
1041             fields_to_search = hint.just_search_in(fields_to_search)
1042             only_in = hint.book_filter()
1043
1044         tokens = self.get_tokens(searched, field='SIMPLE')
1045
1046         q = BooleanQuery()
1047
1048         for fld in fields_to_search:
1049             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1050                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1051
1052         books = []
1053         top = self.searcher.search(q,
1054                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1055             max_results)
1056         for found in top.scoreDocs:
1057             books.append(SearchResult(self, found, how_found="search_book"))
1058
1059         return books
1060
1061     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1062         """
1063         Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
1064         some part/fragment of the book.
1065         """
1066         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1067
1068         flt = None
1069         if hint:
1070             flt = hint.part_filter()
1071
1072         books = []
1073         for q in qrys:
1074             top = self.searcher.search(q,
1075                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1076                                                            flt]),
1077                                        max_results)
1078             for found in top.scoreDocs:
1079                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1080
1081         return books
1082
1083     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1084         """
1085         Tries to use search terms to match different fields of a book (or its parts).
1086         E.g. one word can be an author's name, another a part of the title, and the rest
1087         some words from the third chapter.
1088         """
1089         if tokens_cache is None: tokens_cache = {}
1090         books = []
1091         only_in = None
1092
1093         if hint:
1094             only_in = hint.part_filter()
1095
1096         # content only query : themes x content
1097         q = BooleanQuery()
1098
1099         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1100         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1101
1102         # only search in themes when we do not already filter by themes
1103         if hint is None or hint.just_search_in(['themes']) != []:
1104             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1105                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1106
1107         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1108                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1109
1110         topDocs = self.searcher.search(q, only_in, max_results)
1111         for found in topDocs.scoreDocs:
1112             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1113             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1114
1115         # query themes/content x author/title/tags
1116         q = BooleanQuery()
1117         in_content = BooleanQuery()
1118         in_meta = BooleanQuery()
1119
1120         for fld in ['themes_pl', 'content']:
1121             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1122
1123         for fld in ['tags', 'authors', 'title']:
1124             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1125
1126         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1127         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1128
1129         topDocs = self.searcher.search(q, only_in, max_results)
1130         for found in topDocs.scoreDocs:
1131             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1132             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1133
1134         return books
1135
1136     # def multisearch(self, query, max_results=50):
1137     #     """
1138     #     Search strategy:
1139     #     - (phrase) OR -> content
1140     #                   -> title
1141     #                   -> authors
1142     #     - (keywords)  -> authors
1143     #                   -> motyw
1144     #                   -> tags
1145     #                   -> content
1146     #     """
1147         # queryreader = StringReader(query)
1148         # tokens = self.get_tokens(queryreader)
1149
1150         # top_level = BooleanQuery()
1151         # Should = BooleanClause.Occur.SHOULD
1152
1153         # phrase_level = BooleanQuery()
1154         # phrase_level.setBoost(1.3)
1155
1156         # p_content = self.make_phrase(tokens, joined=True)
1157         # p_title = self.make_phrase(tokens, 'title')
1158         # p_author = self.make_phrase(tokens, 'author')
1159
1160         # phrase_level.add(BooleanClause(p_content, Should))
1161         # phrase_level.add(BooleanClause(p_title, Should))
1162         # phrase_level.add(BooleanClause(p_author, Should))
1163
1164         # kw_level = BooleanQuery()
1165
1166         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1167         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1168         # kw_level.add(j_themes, Should)
1169         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1170         # j_con = self.make_term_query(tokens, joined=True)
1171         # kw_level.add(j_con, Should)
1172
1173         # top_level.add(BooleanClause(phrase_level, Should))
1174         # top_level.add(BooleanClause(kw_level, Should))
1175
1176         # return None
1177
1178     def get_snippets(self, scoreDoc, query, field='content'):
1179         """
1180         Returns a snippet for found scoreDoc.
1181         """
1182         htmlFormatter = SimpleHTMLFormatter()
1183         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1184
1185         stored = self.searcher.doc(scoreDoc.doc)
1186
1187         position = stored.get('snippets_position')
1188         length = stored.get('snippets_length')
1189         if position is None or length is None:
1190             return None
1191         # locate content.
1192         snippets = Snippets(stored.get('book_id')).open()
1193         try:
1194             text = snippets.get((int(position),
1195                                  int(length)))
1196         finally:
1197             snippets.close()
1198
1199         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1200         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1201         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1202
1203         return snip
1204
1205     @staticmethod
1206     def enum_to_array(enum):
1207         """
1208         Converts a Lucene TermEnum to an array of Terms, suitable for
1209         adding to queries.
1210         """
1211         terms = []
1212
1213         while True:
1214             t = enum.term()
1215             if t:
1216                 terms.append(t)
1217             if not enum.next(): break
1218
1219         if terms:
1220             return JArray('object')(terms, Term)
1221
1222     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1223         """
1224         Search for Tag objects using query.
1225         """
1226         if not pdcounter:
1227             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1228         tops = self.searcher.search(query, filters, max_results)
1229
1230         tags = []
1231         for found in tops.scoreDocs:
1232             doc = self.searcher.doc(found.doc)
1233             is_pdcounter = doc.get('is_pdcounter')
1234             if is_pdcounter:
1235                 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1236             else:
1237                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1238                 # don't add the pdcounter tag if same tag already exists
1239             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1240                 tags.append(tag)
1241                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1242         print 'returning %s' % tags
1243         return tags
1244
1245     def search_books(self, query, filter=None, max_results=10):
1246         """
1247         Searches for Book objects using query
1248         """
1249         bks = []
1250         tops = self.searcher.search(query, filter, max_results)
1251         for found in tops.scoreDocs:
1252             doc = self.searcher.doc(found.doc)
1253             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1254         return bks
1255
1256     def make_prefix_phrase(self, toks, field):
1257         q = MultiPhraseQuery()
1258         for i in range(len(toks)):
1259             t = Term(field, toks[i])
1260             if i == len(toks) - 1:
1261                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1262                 if pterms:
1263                     q.add(pterms)
1264                 else:
1265                     q.add(t)
1266             else:
1267                 q.add(t)
1268         return q
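
    # Example (a sketch): for toks == [u'pan', u'tade'] and field 'title', the
    # last position is expanded to every indexed term starting with "tade",
    # so the phrase "pan tadeusz" still matches.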
1269
1270     @staticmethod
1271     def term_filter(term, inverse=False):
1272         only_term = TermsFilter()
1273         only_term.addTerm(term)
1274
1275         if inverse:
1276             neg = BooleanFilter()
1277             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1278             only_term = neg
1279
1280         return only_term
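
    # Example: term_filter(Term('is_book', 'true')) keeps only whole-book
    # documents; with inverse=True it keeps everything except them.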
1281
1282     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
1283         """
1284         Return auto-complete hints for tags
1285         using prefix search.
1286         """
1287         toks = self.get_tokens(string, field='SIMPLE')
1288         top = BooleanQuery()
1289
1290         for field in ['tag_name', 'tag_name_pl']:
1291             if prefix:
1292                 q = self.make_prefix_phrase(toks, field)
1293             else:
1294                 q = self.make_term_query(toks, field)
1295             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1296
1297         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1298
1299         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1300
1301     def hint_books(self, string, max_results=50, prefix=True):
1302         """
1303         Returns auto-complete hints for book titles
1304         (since we do not index 'pseudo' title tags).
1305         Uses prefix search.
1306         """
1307         toks = self.get_tokens(string, field='SIMPLE')
1308
1309         if prefix:
1310             q = self.make_prefix_phrase(toks, 'title')
1311         else:
1312             q = self.make_term_query(toks, 'title')
1313
1314         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1315
1316     @staticmethod
1317     def chain_filters(filters, op=ChainedFilter.AND):
1318         """
1319         Chains a filter list together
1320         """
1321         filters = filter(lambda x: x is not None, filters)
1322         if not filters:
1323             return None
1324         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1325         return chf
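
    # Example: chain_filters([None, only_books]) drops the None and wraps
    # [only_books] in a ChainedFilter with AND; an empty or all-None list
    # returns None (i.e. no filtering).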
1326
1327     def filtered_categories(self, tags):
1328         """
1329         Return a list of tag categories present in the tags list.
1330         """
1331         cats = {}
1332         for t in tags:
1333             cats[t.category] = True
1334         return cats.keys()
1335
1336     def hint(self):
1337         return Hint(self)