more fixes
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray, JavaError
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from multiprocessing.pool import ThreadPool
31 from threading import current_thread
32 import atexit
33 import traceback
34
35
class WLAnalyzer(PerFieldAnalyzerWrapper):
    """
    Analyzer wrapper choosing an analyzer per index field.

    The Polish analyzer is handed to the wrapper constructor as the default;
    the fields listed in __init__ get an explicit analyzer registered.
    """
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        # not sure if needed: there's NOT_ANALYZED meaning basically the same
        keyword = KeywordAnalyzer(Version.LUCENE_34)

        PerFieldAnalyzerWrapper.__init__(self, polish)

        # (field name, analyzer) pairs, registered in this order.
        per_field = (
            ("tags", simple),
            ("technical_editors", simple),
            ("editors", simple),
            ("url", keyword),
            ("source_url", keyword),
            ("source_name", simple),
            ("publisher", simple),
            ("authors", simple),
            # shouldn't the title have two forms? _pl and simple?
            ("title", simple),
            ("is_book", keyword),
            ("themes", simple),
            ("themes_pl", polish),
            ("tag_name", simple),
            ("tag_name_pl", polish),
            ("translators", simple),
            # Pseudo-fields used to pick an analyzer by name.
            ("KEYWORD", keyword),
            ("SIMPLE", simple),
            ("POLISH", polish),
        )
        for field_name, field_analyzer in per_field:
            self.addAnalyzer(field_name, field_analyzer)
74
75
class IndexStore(object):
    """
    Gives access to the search index.

    self.store - lucene index directory (opened on settings.SEARCH_INDEX)
    """
    def __init__(self):
        self.make_index_dir()
        index_path = File(settings.SEARCH_INDEX)
        self.store = SimpleFSDirectory(index_path)

    def make_index_dir(self):
        """
        Create the index directory (with parents).
        An already existing directory is fine; any other OSError propagates.
        """
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
93
94
class IndexChecker(IndexStore):
    """
    Runs lucene's CheckIndex over the index directory.
    """
    def __init__(self):
        super(IndexChecker, self).__init__()

    def check(self):
        """Check the index and return the status object given by CheckIndex."""
        return CheckIndex(self.store).checkIndex()
103
104
class Snippets(object):
    """
    This class manages snippet files for indexed object (book)
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        """
        Make sure the snippet directory exists and remember the book id.
        The snippet file itself is not opened until open() is called.
        """
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.file = None
        # Byte offset at which the next add() will write.
        self.position = 0

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        The file is always opened in binary mode ('b' is appended if missing).
        Returns self, so calls can be chained.
        """
        if 'b' not in mode:
            mode += 'b'
        path = os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id))
        self.file = open(path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple, both counted in UTF-8 bytes.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        """
        Close snippet file.
        Safe to call when the file was never opened or is already closed.
        """
        if self.file is not None:
            self.file.close()
            self.file = None
157
158
class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on index: opening, closing, optimizing.
    Can be used as a context manager: the index is opened on enter
    and optimized + closed on exit.
    """
    def __init__(self, analyzer=None):
        """
        analyzer - analyzer used by the index writer; a WLAnalyzer is
        created when none (or a falsy value) is given.
        """
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        """
        Open an IndexWriter on the store and return it.
        Raises Exception if the index is already opened.

        The analyzer argument is accepted but not used here: the writer
        is always created with self.analyzer.
        """
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        """Optimize the opened index."""
        self.index.optimize()

    def close(self):
        """
        Optimize and close the index writer.
        A JavaError raised by the optimize step is reported and does not
        prevent closing. Does nothing when the index is not opened.
        """
        if self.index is None:
            return

        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
196
197
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring book id
        (and the parent's id, when the book has a parent).
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.

        book_info - already parsed metadata; parsed from the book's XML when None.
        overwrite - remove documents already indexed for this book first.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        # These fields are repeated on every part document. Any of them may
        # be missing (published_date is only set when it could be parsed
        # out of the source name), so pass along only the ones we have.
        book_fields = [meta_fields[name]
                       for name in ('title', 'authors', 'published_date')
                       if name in meta_fields]
        self.index_content(book, book_fields=book_fields)

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Trailing run of digits, optionally followed by "]", "." or spaces.
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname.
        Values are lucene Fields, except 'tags' which is a sequence of Fields.
        """
        fields = {}

        if book_info is None:
            # Close the file once the metadata has been parsed.
            with open(book.xml_file.path) as xml_file:
                book_info = dcparser.parse(xml_file)

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            value = getattr(book_info, field.name, None)
            if not value:
                continue
            # since no type information is available, we use validator
            type_indicator = field.validator
            if type_indicator == dcparser.as_unicode:
                s = value
                if field.multiple:
                    s = ', '.join(s)
                try:
                    fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                except JavaError as je:
                    raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
            elif type_indicator == dcparser.as_person:
                if isinstance(value, dcparser.Person):
                    persons = unicode(value)
                else:
                    persons = ', '.join(map(unicode, value))
                fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
            elif type_indicator == dcparser.as_date:
                fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                           (value.year, value.month, value.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # get published date
        source = book_info.source_name
        # A missing source name simply yields no published date.
        match = self.published_date_re.search(source) if source else None
        print("published date is %s %s" % (match, match is not None and match.groups()))
        if match is not None:
            fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
        This allows for doing phrase queries which do not overlap the gaps (when slop is 0).

        Returns a tuple: field, gap, field, ..., field. An empty input gives an empty tuple.
        """
        interleaved = []
        for field in fields:
            if interleaved:
                interleaved.append(Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED))
            interleaved.append(field)
        return tuple(interleaved)

    def get_master(self, root):
        """
        Returns the first master tag from an etree
        (None when the tree contains no master tag).
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=None):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.

        book_fields - fields added to every part document.
        Returns [] when the document has no master tag, otherwise None.
        """
        if book_fields is None:
            book_fields = []

        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            # Yields (node, None) when entering a node and (None, node) when leaving it.
            # NOTE(review): ignore_tags is not forwarded to the recursive call,
            # so it only filters direct children of the node passed in - confirm
            # whether deeper filtering is wanted before changing this.
            yield node, None
            for child in list(node):
                if child.tag in ignore_tags:
                    continue
                for b, e in walker(child):
                    yield b, e
            yield None, node

        def fix_format(text):
            # Accepts either a string or a list of strings/None (joined with spaces);
            # strips a "/" found at the end of a line.
            if isinstance(text, list):
                # need to join it first
                text = u' '.join(s for s in text if s is not None)
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            # Builds the document for one part (a section or a fragment) and
            # stores its content in the snippet file.
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span') or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if fields.get('themes'):
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        # Fragments that have been started but not ended yet, keyed by id.
        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(list(master)):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []

                for start, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        # a <motyw> without its <begin> is broken, skip it
                        if start.text is not None and fid in fragments:
                            fragments[fid]['themes'] += [give_me_utf8(theme).strip()
                                                         for theme in start.text.split(',')]

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        # The fragment is finished either way, stop collecting into it.
                        frag = fragments.pop(fid)
                        if not frag['themes']:
                            continue  # empty themes list.

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)

                    # Collect content.
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                        content.append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
                        content.append(end.tail)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))

                self.index.addDocument(doc)

        finally:
            snippets.close()
501
502
def log_exception_wrapper(f):
    """
    Wrap f so that any Exception it raises is printed (message and
    traceback) before being re-raised.

    The wrapper takes positional arguments only and returns whatever f returns.
    """
    def _wrap(*a):
        try:
            return f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            # bare raise keeps the original traceback
            raise
    return _wrap
512
513
class ReusableIndex(Index):
    """
    Works like index, but does not close/optimize Lucene index
    until program exit (uses atexit hook).
    This is useful for importbooks command.

    if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
    """
    # The index writer shared by all instances (None until first open()).
    index = None

    def open(self, analyzer=None, threads=4):
        """
        Attach to the shared index writer, opening it on first use
        (and registering close_reusable to run at exit).
        Returns the index writer, like BaseIndex.open().
        The threads argument is not used.
        """
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)
        return self.index

    @staticmethod
    def close_reusable():
        """
        Optimize and close the shared index writer, if one is opened.
        The writer is closed even when the optimize step fails.
        """
        if ReusableIndex.index is not None:
            try:
                ReusableIndex.index.optimize()
            finally:
                ReusableIndex.index.close()
                ReusableIndex.index = None

    def close(self):
        """Does nothing - the shared index is closed by close_reusable()."""
        pass
546
547
class JoinSearch(object):
    """
    Mixin for handling block join queries.
    (currently unused)

    Relies on self.parent_filter, self.searcher and self.query()
    being provided by the class it is mixed into.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        Recursively rewrite the query: a non-boolean query whose terms
        all belong to the given fields is wrapped in a BlockJoinQuery
        (and so delegated to children documents); boolean queries have
        each of their clauses rewritten in place.
        """
        if not BooleanQuery.instance_(query):
            terms = HashSet()
            query.extractTerms(terms)
            for raw_term in terms:
                if Term.cast_(raw_term).field() not in fields:
                    # a term from some other field - leave the query alone
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

        bool_query = BooleanQuery.cast_(query)
        for raw_clause in bool_query:
            clause = BooleanClause.cast_(raw_clause)
            rewritten = self.wrapjoins(clause.getQuery(), fields)
            clause.setQuery(rewritten)
        return bool_query

    def bsearch(self, query, max_results=50):
        """
        Run the query as a block join query.
        Returns a (list of Book objects, total hit count) tuple.
        """
        parsed = self.query(query)
        join_query = BlockJoinQuery(parsed, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(join_query, max_results)
        books = []
        for found in tops.scoreDocs:
            stored = self.searcher.doc(found.doc)
            book_id = stored.get("book_id")
            books.append(catalogue.models.Book.objects.get(id=book_id))
        return (books, tops.totalHits)
589
590
class SearchResult(object):
    """
    Search results for a single book: the book id, a score and
    the list of hits (sections and fragments) found in that book.
    """
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        """
        search - object giving access to the searcher (and get_tokens())
        scoreDocs - a single scored hit; its stored document is loaded here
        score - overrides the hit's own score when given (and non-zero)
        how_found, snippets - extra data attached to the hit
        searched - the searched text, used to find out which themes were hit
        tokens_cache - dict passed to get_tokens() as its cache
        """
        if tokens_cache is None:
            tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        # Set these before the early return below, so that results
        # for documents without a header_type are complete as well.
        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        if pd is None:
            print("published_date is none for book %d" % self.book_id)
            pd = 0
        self.published_date = int(pd)

        header_type = stored.get("header_type")
        if not header_type:
            # no header_type stored - nothing to add to the hit list
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = 1
        raw_span = stored.get('header_span')
        if raw_span is not None:
            header_span = int(raw_span) or 1

        fragment = stored.get("fragment_anchor")

        if snippets:
            snippets = snippets.replace("/\n", "\n")
        hit = (sec + (header_span,), fragment, scoreDocs.score,
               {'how_found': how_found, 'snippets': [snippets] if snippets else []})

        self._hits.append(hit)

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        """
        Add the hits of another result for the same book to this one,
        keeping the better score. Returns self.
        Raises ValueError when the results are for different books.
        """
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        # the cached hit list no longer matches self._hits
        self._processed_hits = None
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    @property
    def hits(self):
        """
        List of dicts describing the hits, best score first (computed once and cached).
        Sections covered by a fragment hit are dropped, duplicates are
        removed keeping the best score, and fragments missing from
        the database are skipped.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        # Indexes into a hit tuple and into its position tuple.
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = [h for h in self._hits if h[FRAGMENT] is not None]
        sect = [h for h in self._hits if h[FRAGMENT] is None]

        def covered_by_fragment(section):
            index = section[POSITION][POSITION_INDEX]
            for f in frags:
                first = f[POSITION][POSITION_INDEX]
                if first <= index < first + f[POSITION][POSITION_SPAN]:
                    return True
            return False

        sect = [s for s in sect if not covered_by_fragment(s)]

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments and fragments[fid][SCORE] >= f[SCORE]:
                continue
            fragments[fid] = f
        frags = list(fragments.values())

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections and sections[si]['score'] >= s[SCORE]:
                continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    if any(t in name_tokens for t in tokens) and theme not in themes_hit:
                        themes_hit.append(theme)

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    def __unicode__(self):
        # the score is a float, do not truncate it
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        """
        Merge results from many lists, so that there is one result per book.
        The first result seen for a book absorbs the later ones.
        """
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return list(books.values())

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c
758
759
class Hint(object):
    """
    Given some hint information (information we already know about)
    our search target - like author, title (specific book), epoch, genre, kind
    we can narrow down search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}   # tag category -> list of tags
        self.part_tags = []   # theme tags
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = list(books)

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of)
        is necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                self.book_tags.setdefault(t.category, []).append(t)
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (but they are normally in tags field)
        returns a filter accepting only books with specific tags.
        Each tag name becomes a phrase which must match.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme).
        Returns None when there are no book tags.
        """
        tags = [t for category_tags in self.book_tags.values() for t in category_tags]
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        # Truthiness rather than comparing with [] - an empty hint
        # must not produce an (empty) book id filter.
        if self._books:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        """True when no books were hinted."""
        return not self._books

    def just_search_in(self, all):
        """Holds logic to figure out which indexes should be search, when we have some hints already"""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
853
854
855 class Search(IndexStore):
856     """
857     Search facilities.
858     """
859     def __init__(self, default_field="content"):
860         IndexStore.__init__(self)
861         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
862         # self.analyzer = WLAnalyzer()
863         self.searcher = IndexSearcher(self.store, True)
864         self.parser = QueryParser(Version.LUCENE_34, default_field,
865                                   self.analyzer)
866
867         self.parent_filter = TermsFilter()
868         self.parent_filter.addTerm(Term("is_book", "true"))
869
870     def query(self, query):
871         """Parse query in default Lucene Syntax. (for humans)
872         """
873         return self.parser.parse(query)
874
875     def simple_search(self, query, max_results=50):
876         """Runs a query for books using lucene syntax. (for humans)
877         Returns (books, total_hits)
878         """
879
880         tops = self.searcher.search(self.query(query), max_results)
881         bks = []
882         for found in tops.scoreDocs:
883             doc = self.searcher.doc(found.doc)
884             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
885         return (bks, tops.totalHits)
886
887     def get_tokens(self, searched, field='content', cached=None):
888         """returns tokens analyzed by a proper (for a field) analyzer
889         argument can be: StringReader, string/unicode, or tokens. In the last case
890         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
891         """
892         if cached is not None and field in cached:
893             return cached[field]
894
895         if isinstance(searched, str) or isinstance(searched, unicode):
896             searched = StringReader(searched)
897         elif isinstance(searched, list):
898             return searched
899
900         searched.reset()
901         tokens = self.analyzer.reusableTokenStream(field, searched)
902         toks = []
903         while tokens.incrementToken():
904             cta = tokens.getAttribute(CharTermAttribute.class_)
905             toks.append(cta.toString())
906
907         if cached is not None:
908             cached[field] = toks
909
910         return toks
911
912     def fuzziness(self, fuzzy):
913         """Helper method to sanitize fuzziness"""
914         if not fuzzy:
915             return None
916         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
917             return fuzzy
918         else:
919             return 0.5
920
921     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
922         """
923         Return a PhraseQuery with a series of tokens.
924         """
925         if fuzzy:
926             phrase = MultiPhraseQuery()
927             for t in tokens:
928                 term = Term(field, t)
929                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
930                 fuzzterms = []
931
932                 while True:
933                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
934                     ft = fuzzterm.term()
935                     if ft:
936                         fuzzterms.append(ft)
937                     if not fuzzterm.next(): break
938                 if fuzzterms:
939                     phrase.add(JArray('object')(fuzzterms, Term))
940                 else:
941                     phrase.add(term)
942         else:
943             phrase = PhraseQuery()
944             phrase.setSlop(slop)
945             for t in tokens:
946                 term = Term(field, t)
947                 phrase.add(term)
948         return phrase
949
950     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
951         """
952         Returns term queries joined by boolean query.
953         modal - applies to boolean query
954         fuzzy - should the query by fuzzy.
955         """
956         q = BooleanQuery()
957         for t in tokens:
958             term = Term(field, t)
959             if fuzzy:
960                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
961             else:
962                 term = TermQuery(term)
963             q.add(BooleanClause(term, modal))
964         return q
965
966     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
967                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
968         if filters is None: filters = []
969         if tokens_cache is None: tokens_cache = {}
970
971         tokens = self.get_tokens(searched, field, cached=tokens_cache)
972
973         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
974         if book:
975             filters.append(self.term_filter(Term('is_book', 'true')))
976         top = self.searcher.search(query, self.chain_filters(filters), max_results)
977
978         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
979
980     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
981                     filters=None, tokens_cache=None, boost=None, snippets=True):
982         if filters is None: filters = []
983         if tokens_cache is None: tokens_cache = {}
984
985         if book:
986             filters.append(self.term_filter(Term('is_book', 'true')))
987
988         query = BooleanQuery()
989
990         for fld in fields:
991             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
992
993             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
994                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
995
996         top = self.searcher.search(query, self.chain_filters(filters), max_results)
997
998         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
999                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1000
1001     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1002         """
1003         Search for perfect book matches. Just see if the query matches with some author or title,
1004         taking hints into account.
1005         """
1006         fields_to_search = ['authors', 'title']
1007         only_in = None
1008         if hint:
1009             if not hint.should_search_for_book():
1010                 return []
1011             fields_to_search = hint.just_search_in(fields_to_search)
1012             only_in = hint.book_filter()
1013
1014         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1015
1016         books = []
1017         for q in qrys:
1018             top = self.searcher.search(q,
1019                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1020                 max_results)
1021             for found in top.scoreDocs:
1022                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1023         return books
1024
1025     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1026         fields_to_search = ['tags', 'authors', 'title']
1027
1028         only_in = None
1029         if hint:
1030             if not hint.should_search_for_book():
1031                 return []
1032             fields_to_search = hint.just_search_in(fields_to_search)
1033             only_in = hint.book_filter()
1034
1035         tokens = self.get_tokens(searched, field='SIMPLE')
1036
1037         q = BooleanQuery()
1038
1039         for fld in fields_to_search:
1040             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1041                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1042
1043         books = []
1044         top = self.searcher.search(q,
1045                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1046             max_results)
1047         for found in top.scoreDocs:
1048             books.append(SearchResult(self, found, how_found="search_book"))
1049
1050         return books
1051
1052     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1053         """
1054         Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
1055         some part/fragment of the book.
1056         """
1057         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1058
1059         flt = None
1060         if hint:
1061             flt = hint.part_filter()
1062
1063         books = []
1064         for q in qrys:
1065             top = self.searcher.search(q,
1066                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1067                                                            flt]),
1068                                        max_results)
1069             for found in top.scoreDocs:
1070                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1071
1072         return books
1073
1074     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1075         """
1076         Tries to use search terms to match different fields of book (or its parts).
1077         E.g. one word can be an author survey, another be a part of the title, and the rest
1078         are some words from third chapter.
1079         """
1080         if tokens_cache is None: tokens_cache = {}
1081         books = []
1082         only_in = None
1083
1084         if hint:
1085             only_in = hint.part_filter()
1086
1087         # content only query : themes x content
1088         q = BooleanQuery()
1089
1090         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1091         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1092
1093         # only search in themes when we do not already filter by themes
1094         if hint is None or hint.just_search_in(['themes']) != []:
1095             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1096                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1097
1098         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1099                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1100
1101         topDocs = self.searcher.search(q, only_in, max_results)
1102         for found in topDocs.scoreDocs:
1103             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1104             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1105
1106         # query themes/content x author/title/tags
1107         q = BooleanQuery()
1108         in_content = BooleanQuery()
1109         in_meta = BooleanQuery()
1110
1111         for fld in ['themes_pl', 'content']:
1112             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1113
1114         for fld in ['tags', 'authors', 'title']:
1115             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1116
1117         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1118         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1119
1120         topDocs = self.searcher.search(q, only_in, max_results)
1121         for found in topDocs.scoreDocs:
1122             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1123             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1124
1125         return books
1126
1127     # def multisearch(self, query, max_results=50):
1128     #     """
1129     #     Search strategy:
1130     #     - (phrase) OR -> content
1131     #                   -> title
1132     #                   -> authors
1133     #     - (keywords)  -> authors
1134     #                   -> motyw
1135     #                   -> tags
1136     #                   -> content
1137     #     """
1138         # queryreader = StringReader(query)
1139         # tokens = self.get_tokens(queryreader)
1140
1141         # top_level = BooleanQuery()
1142         # Should = BooleanClause.Occur.SHOULD
1143
1144         # phrase_level = BooleanQuery()
1145         # phrase_level.setBoost(1.3)
1146
1147         # p_content = self.make_phrase(tokens, joined=True)
1148         # p_title = self.make_phrase(tokens, 'title')
1149         # p_author = self.make_phrase(tokens, 'author')
1150
1151         # phrase_level.add(BooleanClause(p_content, Should))
1152         # phrase_level.add(BooleanClause(p_title, Should))
1153         # phrase_level.add(BooleanClause(p_author, Should))
1154
1155         # kw_level = BooleanQuery()
1156
1157         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1158         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1159         # kw_level.add(j_themes, Should)
1160         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1161         # j_con = self.make_term_query(tokens, joined=True)
1162         # kw_level.add(j_con, Should)
1163
1164         # top_level.add(BooleanClause(phrase_level, Should))
1165         # top_level.add(BooleanClause(kw_level, Should))
1166
1167         # return None
1168
1169     def get_snippets(self, scoreDoc, query, field='content'):
1170         """
1171         Returns a snippet for found scoreDoc.
1172         """
1173         htmlFormatter = SimpleHTMLFormatter()
1174         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1175
1176         stored = self.searcher.doc(scoreDoc.doc)
1177
1178         position = stored.get('snippets_position')
1179         length = stored.get('snippets_length')
1180         if position is None or length is None:
1181             return None
1182         # locate content.
1183         snippets = Snippets(stored.get('book_id')).open()
1184         try:
1185             text = snippets.get((int(position),
1186                                  int(length)))
1187         finally:
1188             snippets.close()
1189
1190         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1191         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1192         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1193
1194         return snip
1195
1196     @staticmethod
1197     def enum_to_array(enum):
1198         """
1199         Converts a lucene TermEnum to array of Terms, suitable for
1200         addition to queries
1201         """
1202         terms = []
1203
1204         while True:
1205             t = enum.term()
1206             if t:
1207                 terms.append(t)
1208             if not enum.next(): break
1209
1210         if terms:
1211             return JArray('object')(terms, Term)
1212
1213     def search_tags(self, query, filter=None, max_results=40):
1214         """
1215         Search for Tag objects using query.
1216         """
1217         tops = self.searcher.search(query, filter, max_results)
1218
1219         tags = []
1220         for found in tops.scoreDocs:
1221             doc = self.searcher.doc(found.doc)
1222             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1223             tags.append(tag)
1224             print "%s (%d) -> %f" % (tag, tag.id, found.score)
1225
1226         return tags
1227
1228     def search_books(self, query, filter=None, max_results=10):
1229         """
1230         Searches for Book objects using query
1231         """
1232         bks = []
1233         tops = self.searcher.search(query, filter, max_results)
1234         for found in tops.scoreDocs:
1235             doc = self.searcher.doc(found.doc)
1236             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1237         return bks
1238
1239     def create_prefix_phrase(self, toks, field):
1240         q = MultiPhraseQuery()
1241         for i in range(len(toks)):
1242             t = Term(field, toks[i])
1243             if i == len(toks) - 1:
1244                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1245                 if pterms:
1246                     q.add(pterms)
1247                 else:
1248                     q.add(t)
1249             else:
1250                 q.add(t)
1251         return q
1252
1253     @staticmethod
1254     def term_filter(term, inverse=False):
1255         only_term = TermsFilter()
1256         only_term.addTerm(term)
1257
1258         if inverse:
1259             neg = BooleanFilter()
1260             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1261             only_term = neg
1262
1263         return only_term
1264
1265     def hint_tags(self, string, max_results=50):
1266         """
1267         Return auto-complete hints for tags
1268         using prefix search.
1269         """
1270         toks = self.get_tokens(string, field='SIMPLE')
1271         top = BooleanQuery()
1272
1273         for field in ['tag_name', 'tag_name_pl']:
1274             q = self.create_prefix_phrase(toks, field)
1275             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1276
1277         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1278
1279         return self.search_tags(top, no_book_cat, max_results=max_results)
1280
1281     def hint_books(self, string, max_results=50):
1282         """
1283         Returns auto-complete hints for book titles
1284         Because we do not index 'pseudo' title-tags.
1285         Prefix search.
1286         """
1287         toks = self.get_tokens(string, field='SIMPLE')
1288
1289         q = self.create_prefix_phrase(toks, 'title')
1290
1291         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1292
1293     @staticmethod
1294     def chain_filters(filters, op=ChainedFilter.AND):
1295         """
1296         Chains a filter list together
1297         """
1298         filters = filter(lambda x: x is not None, filters)
1299         if not filters or filters is []:
1300             return None
1301         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1302         return chf
1303
1304     def filtered_categories(self, tags):
1305         """
1306         Return a list of tag categories, present in tags list.
1307         """
1308         cats = {}
1309         for t in tags:
1310             cats[t.category] = True
1311         return cats.keys()
1312
1313     def hint(self):
1314         return Hint(self)