# apps/search/index.py (wolnelektury.git)
# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM before any lucene class is used.
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

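
# Illustrative sketch (editor's addition, not used by the app): a round trip
# through the Snippets file as indexing and retrieval use it. The book id
# 9999 is hypothetical; the path comes from settings.SEARCH_INDEX as above.
def _example_snippets_roundtrip():
    snippets = Snippets(9999).open('w')
    try:
        pos = snippets.add(u"Litwo! Ojczyzno moja!")
    finally:
        snippets.close()
    snippets = Snippets(9999).open()
    try:
        return snippets.get(pos)  # -> u"Litwo! Ojczyzno moja!"
    finally:
        snippets.close()
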

class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on the index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already open")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

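
# Illustrative sketch (editor's addition): BaseIndex subclasses are context
# managers, so a typical write session looks like this. `Index` is defined
# below; `book` is a hypothetical catalogue.models.Book instance.
def _example_base_index_usage(book):
    with Index() as index:
        index.index_tags()
        index.index_book(book)
    # on __exit__ the index is optimized and closed
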

class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        # some of these fields may be absent (e.g. no published_date match)
        self.index_content(book, book_fields=[meta_fields[fld]
            for fld in ('title', 'authors', 'published_date') if fld in meta_fields])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # get the published date from the source note
        source = book_info.source_name
        match = self.published_date_re.search(source)
        if match is not None:
            fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields (indexed spaces) and returns the result.
        This allows phrase queries which do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

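    # Illustrative sketch (editor's addition): add_gaps turns [A, B] into
    # (A, <gap>, B), so a slop-0 phrase query cannot match across the
    # boundary between two separate tag values. The tag names are hypothetical.
    def _example_add_gaps(self):
        fields = [Field("tags", t, Field.Store.NO, Field.Index.ANALYZED)
                  for t in [u"romans", u"epika"]]
        return self.add_gaps(fields, "tags")
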
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            #            separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #          text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span') or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'is_footnote' in fields:
                doc.add(fields['is_footnote'])

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['content'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)
                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        handle_text.pop()
                        if footnote:
                            doc = add_part(snippets, header_index=position, header_type=header.tag,
                                           content=u''.join(footnote),
                                           is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
                            self.index.addDocument(doc)
                            print "@ footnote text: %s" % footnote
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])
                        print '@ FRAG %s' % frag['content']
                        self.index.addDocument(doc)

                    # collect content
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))
                print '@ CONTENT: %s' % fix_format(content)

                self.index.addDocument(doc)

        finally:
            snippets.close()

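
# Illustrative sketch (editor's addition): a full (re)indexing pass for one
# book, as a management command might run it. `book` is a hypothetical
# catalogue.models.Book with an xml_file attached.
def _example_index_book(book):
    index = Index()
    index.open()
    try:
        index.index_book(book, overwrite=True)
    finally:
        index.close()  # optimizes and closes the underlying IndexWriter
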

def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass

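
# Illustrative sketch (editor's addition): ReusableIndex keeps one shared
# writer open across instances; when atexit cannot be relied upon (e.g. in a
# long-running worker), close it explicitly. `books` is a hypothetical
# iterable of catalogue.models.Book.
def _example_reusable_index(books):
    idx = ReusableIndex()
    idx.open()
    try:
        for book in books:
            idx.index_book(book)
    finally:
        ReusableIndex.close_reusable()
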

class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        Modifies the query recursively, so that contained Term and Phrase
        queries matching the provided fields are wrapped in a BlockJoinQuery
        and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, search, scoreDoc, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDoc.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDoc.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = int(header_span) if header_span is not None else 1

        fragment = stored.get("fragment_anchor")

        pd = stored.get("published_date")
        if pd is None:
            pd = 0
        self.published_date = int(pd)

        if snippets:
            snippets = snippets.replace("/\n", "\n")
        hit = (sec + (header_span,), fragment, scoreDoc.score, {'how_found': how_found, 'snippets': [snippets] if snippets else []})

        self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # split hits into sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        # drop sections already covered by a fragment hit
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        hits = []

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)
                            break

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

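
# Illustrative sketch (editor's addition): hits from several strategies can
# be merged into one SearchResult per book and ordered best-first, since
# SearchResult defines __cmp__ (an earlier published_date breaks score ties).
# `search` is a Search instance; `query` is a hypothetical query string.
def _example_aggregate_results(search, query):
    results = SearchResult.aggregate(
        search.search_perfect_book(query),
        search.search_everywhere(query))
    return sorted(results, reverse=True)
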

class Hint(object):
    """
    Given some hint information (things we already know about the search
    target, like the author, title (a specific book), epoch, genre or kind)
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search within these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (given as a list)
        are required.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they normally live in
        the 'tags' field), returns a filter accepting only books with those tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return not self._books

    def just_search_in(self, all_fields):
        """Holds the logic to figure out which indexes should be searched
        when we already have some hints."""
        some = []
        for field in all_fields:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books:
                continue
            if field in ('themes', 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some

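
# Illustrative sketch (editor's addition): narrowing a search with a Hint.
# `author_tag` is a hypothetical catalogue.models.Tag with category 'author';
# the query string is made up.
def _example_hint_usage(search, author_tag):
    hint = search.hint()
    hint.tags([author_tag])
    # 'authors' is now excluded from the fields searched:
    return search.search_perfect_book(u"pan tadeusz", hint=hint)
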

class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in default Lucene syntax (for humans)."""
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using Lucene syntax (for humans).
        Returns (books, total_hits).
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

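    # Illustrative sketch (editor's addition): the simplest end-to-end query
    # path through this class. The query string is hypothetical but uses
    # plain Lucene syntax against the indexed fields.
    def _example_simple_search(self):
        books, total_hits = self.simple_search(u'authors:Mickiewicz')
        return books, total_hits
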
    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode, or a token list;
        in the last case the tokens are returned as-is (so tokens can be
        reused when the analyzer does not change).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, basestring):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks

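    # Illustrative sketch (editor's addition): the `cached` dict lets one
    # analysis pass per field be shared by several query builders.
    # `searched` is a hypothetical query string.
    def _example_token_cache(self, searched):
        cache = {}
        self.get_tokens(searched, 'content', cached=cache)  # analyzes and stores
        return self.get_tokens(searched, 'content', cached=cache)  # cache hit
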
    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness parameter."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined in a BooleanQuery.
        modal - occurrence requirement applied to each clause
        fuzzy - whether the term queries should be fuzzy
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

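    # Illustrative sketch (editor's addition): the two low-level query
    # builders above. make_phrase keeps token order (within slop), while
    # make_term_query just joins the tokens with the given occurrence mode.
    # `searched` is a hypothetical query string.
    def _example_query_builders(self, searched):
        toks = self.get_tokens(searched, 'content')
        phrase = self.make_phrase(toks, field='content', slop=2)
        loose = self.make_term_query(toks, field='content',
                                     modal=BooleanClause.Occur.MUST)
        return phrase, loose
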
    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(self.get_snippets(found, query) if snippets else None), searched=searched) for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(self.get_snippets(found, query) if snippets else None)) for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches
        some author or title, taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts containing a phrase that perfectly matches
        (with a slop of 2, the default for make_phrase()) some part/fragment
        of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of a book (or its parts).
        E.g. one word can be an author's surname, another a part of the title,
        and the rest some words from the third chapter.
        """
        if tokens_cache is None: tokens_cache = {}
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        # return None

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None
        # locate content.
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(position),
                                 int(length)))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

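    # Illustrative sketch (editor's addition): highlighting the best match
    # for a hand-written query; get_snippets() needs both the scoreDoc and
    # the query that produced it. `query_text` is hypothetical user input.
    def _example_highlight(self, query_text):
        query = self.query(query_text)
        top = self.searcher.search(query, 10)
        if top.scoreDocs:
            return self.get_snippets(top.scoreDocs[0], query)
        return None
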
    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        adding to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Search for Tag objects using query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print "%s (%d) -> %f" % (tag, tag.id, found.score)

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

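    # Illustrative sketch (editor's addition): term_filter and chain_filters
    # (defined below) combined the way the search_* methods above use them;
    # chain_filters silently drops None entries such as an empty hint filter.
    def _example_filter_chain(self, hint):
        return self.chain_filters([hint.book_filter(),
                                   self.term_filter(Term('is_book', 'true'))])
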
    def hint_tags(self, string, max_results=50):
        """
        Return auto-complete hints for tags
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Returns auto-complete hints for book titles, searched with a prefix
        phrase directly on the 'title' field (we do not index 'pseudo'
        title tags).
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

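    # Illustrative sketch (editor's addition): the two autocomplete entry
    # points; the prefixes are hypothetical user input.
    def _example_autocomplete(self):
        tags = self.hint_tags(u"rom")      # tag names starting with "rom"
        books = self.hint_books(u"pan t")  # titles starting with "pan t"
        return tags, books
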
    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together, dropping None entries.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)