apps/search/index.py (wolnelektury.git)
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
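# initVM() must run before any of the Lucene classes above are used; the returned
# handle is kept so that other threads can attach to the JVM if they need to call
# into Lucene.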
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
33 import atexit
34 import traceback
35
36
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
38     def __init__(self):
39         polish = PolishAnalyzer(Version.LUCENE_34)
40         #        polish_gap.setPositionIncrementGap(999)
41
42         simple = SimpleAnalyzer(Version.LUCENE_34)
43         #        simple_gap.setPositionIncrementGap(999)
44
45         keyword = KeywordAnalyzer(Version.LUCENE_34)
46
47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
48
49         PerFieldAnalyzerWrapper.__init__(self, polish)
50
51         self.addAnalyzer("tags", simple)
52         self.addAnalyzer("technical_editors", simple)
53         self.addAnalyzer("editors", simple)
54         self.addAnalyzer("url", keyword)
55         self.addAnalyzer("source_url", keyword)
56         self.addAnalyzer("source_name", simple)
57         self.addAnalyzer("publisher", simple)
58         self.addAnalyzer("authors", simple)
59         self.addAnalyzer("title", simple)
60
61         self.addAnalyzer("is_book", keyword)
62         # shouldn't the title have two forms? _pl and simple?
63
64         self.addAnalyzer("themes", simple)
65         self.addAnalyzer("themes_pl", polish)
66
67         self.addAnalyzer("tag_name", simple)
68         self.addAnalyzer("tag_name_pl", polish)
69
70         self.addAnalyzer("translators", simple)
71
72         self.addAnalyzer("KEYWORD", keyword)
73         self.addAnalyzer("SIMPLE", simple)
74         self.addAnalyzer("POLISH", polish)
75
76
77 class IndexStore(object):
78     """
79     Provides access to search index.
80
81     self.store - lucene index directory
82     """
83     def __init__(self):
84         self.make_index_dir()
85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
86
87     def make_index_dir(self):
88         try:
89             os.makedirs(settings.SEARCH_INDEX)
90         except OSError as exc:
91             if exc.errno == errno.EEXIST:
92                 pass
93             else: raise
94
95
96 class IndexChecker(IndexStore):
97     def __init__(self):
98         IndexStore.__init__(self)
99
100     def check(self):
101         checker = CheckIndex(self.store)
102         status = checker.checkIndex()
103         return status
104
105
106 class Snippets(object):
107     """
108     This class manages snippet files for an indexed object (book).
109     The snippets are concatenated together, and their positions and
110     lengths are kept in Lucene index fields.
111     """
112     SNIPPET_DIR = "snippets"
113
114     def __init__(self, book_id):
115         try:
116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117         except OSError as exc:
118             if exc.errno == errno.EEXIST:
119                 pass
120             else: raise
121         self.book_id = book_id
122         self.file = None
123
124     def open(self, mode='r'):
125         """
126         Open the snippet file. Call .close() afterwards.
127         """
128         if 'b' not in mode:
129             mode += 'b'
130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
131         self.position = 0
132         return self
133
134     def add(self, snippet):
135         """
136         Append a snippet (unicode) to the snippet file.
137         Return a (position, length) tuple
138         """
139         txt = snippet.encode('utf-8')
140         l = len(txt)
141         self.file.write(txt)
142         pos = (self.position, l)
143         self.position += l
144         return pos
145
146     def get(self, pos):
147         """
148         Given a (position, length) tuple, return a unicode string
149         of the snippet stored there.
150         """
151         self.file.seek(pos[0], 0)
152         txt = self.file.read(pos[1]).decode('utf-8')
153         return txt
154
155     def close(self):
156         """Close snippet file"""
157         self.file.close()
158
159
160 class BaseIndex(IndexStore):
161     """
162     Base index class.
163     Provides basic operations on index: opening, closing, optimizing.
164     """
165     def __init__(self, analyzer=None):
166         super(BaseIndex, self).__init__()
167         self.index = None
168         if not analyzer:
169             analyzer = WLAnalyzer()
170         self.analyzer = analyzer
171
172     def open(self, analyzer=None):
173         if self.index:
174             raise Exception("Index is already opened")
175         self.index = IndexWriter(self.store, self.analyzer,\
176                                  IndexWriter.MaxFieldLength.LIMITED)
177         return self.index
178
179     def optimize(self):
180         self.index.optimize()
181
182     def close(self):
183         try:
184             self.index.optimize()
185         except JavaError, je:
186             print "Error during optimize phase, check index: %s" % je
187
188         self.index.close()
189         self.index = None
190
191     def __enter__(self):
192         self.open()
193         return self
194
195     def __exit__(self, type, value, tb):
196         self.close()
197
198
199 class Index(BaseIndex):
200     """
201     Class indexing books.
202     """
203     def __init__(self, analyzer=None):
204         super(Index, self).__init__(analyzer)
205
206     def index_tags(self):
207         """
208         Re-index global tag list.
209         Removes all tags from the index, then indexes them again.
210         Indexed fields include: id, name (with and without Polish stems), category.
211         """
212         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
213         self.index.deleteDocuments(q)
214
215         for tag in catalogue.models.Tag.objects.all():
216             doc = Document()
217             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
218             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
220             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
221             self.index.addDocument(doc)
222
223         for pdtag in PDCounterAuthor.objects.all():
224             doc = Document()
225             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
226             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
227             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
228             doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
229             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
230             self.index.addDocument(doc)
231
232     def create_book_doc(self, book):
233         """
234         Create a Lucene document referring to the book id.
235         """
236         doc = Document()
237         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
238         if book.parent is not None:
239             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
240         return doc
241
242     def remove_book(self, book):
243         """Removes a book from search index.
244         book - Book instance."""
245         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
246         self.index.deleteDocuments(q)
247
248     def index_book(self, book, book_info=None, overwrite=True):
249         """
250         Indexes the book.
251         Creates a lucene document for extracted metadata
252         and calls self.index_content() to index the contents of the book.
253         """
254         if overwrite:
255             self.remove_book(book)
256
257         book_doc = self.create_book_doc(book)
258         meta_fields = self.extract_metadata(book, book_info)
259         for f in meta_fields.values():
260             if isinstance(f, list) or isinstance(f, tuple):
261                 for elem in f:
262                     book_doc.add(elem)
263             else:
264                 book_doc.add(f)
265
266         self.index.addDocument(book_doc)
267         del book_doc
268
269         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
270
271     master_tags = [
272         'opowiadanie',
273         'powiesc',
274         'dramat_wierszowany_l',
275         'dramat_wierszowany_lp',
276         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
277         'wywiad',
278         ]
279
280     ignore_content_tags = [
281         'uwaga', 'extra',
282         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
283         'didaskalia',
284         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
285         ]
286
287     footnote_tags = ['pa', 'pt', 'pr', 'pe']
288
289     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
290
291     published_date_re = re.compile("([0-9]+)[\]. ]*$")
292
293     def extract_metadata(self, book, book_info=None):
294         """
295         Extract metadata from the book and return a map of fields keyed by field name.
296         """
297         fields = {}
298
299         if book_info is None:
300             book_info = dcparser.parse(open(book.xml_file.path))
301
302         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
303         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
304         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
305
306         # validator, name
307         for field in dcparser.BookInfo.FIELDS:
308             if hasattr(book_info, field.name):
309                 if not getattr(book_info, field.name):
310                     continue
311                 # since no type information is available, we use validator
312                 type_indicator = field.validator
313                 if type_indicator == dcparser.as_unicode:
314                     s = getattr(book_info, field.name)
315                     if field.multiple:
316                         s = ', '.join(s)
317                     try:
318                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
319                     except JavaError as je:
320                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
321                 elif type_indicator == dcparser.as_person:
322                     p = getattr(book_info, field.name)
323                     if isinstance(p, dcparser.Person):
324                         persons = unicode(p)
325                     else:
326                         persons = ', '.join(map(unicode, p))
327                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
328                 elif type_indicator == dcparser.as_date:
329                     dt = getattr(book_info, field.name)
330                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
331                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
332
333         # get published date
334         if hasattr(book_info, 'source_name') and book_info.source_name:
335             source = book_info.source_name
336             match = self.published_date_re.search(source)
337             if match is not None:
338                 fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
339
340         return fields
341
342     def add_gaps(self, fields, fieldname):
343         """
344         Interleaves a list of fields with gap fields (indexed single spaces) and returns the result.
345         This allows phrase queries that do not cross the gaps (when slop is 0).
346         """
347         def gap():
348             while True:
349                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
350         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
351
352     def get_master(self, root):
353         """
354         Returns the first master tag from an etree.
355         """
356         for master in root.iter():
357             if master.tag in self.master_tags:
358                 return master
359
360     def index_content(self, book, book_fields=[]):
361         """
362         Walks the book XML and extracts content from it.
363         Adds parts for each header tag and for each fragment.
364         """
365         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
366         root = wld.edoc.getroot()
367
368         master = self.get_master(root)
369         if master is None:
370             return []
371
372         def walker(node, ignore_tags=[]):
373             yield node, None
374             for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
375                 for b, e in walker(child):
376                     yield b, e
377             yield None, node
378             return
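        # walker() yields a pre/post-order event stream: (node, None) when a node
        # is entered and (None, node) when it is left. Children whose tag is in
        # ignore_tags are skipped; note that the recursive call does not propagate
        # ignore_tags, so only direct children of the given node are filtered.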
379
380         def fix_format(text):
381             #            separator = [u" ", u"\t", u".", u";", u","]
382             if isinstance(text, list):
383                 # need to join it first
384                 text = filter(lambda s: s is not None, text)
385                 text = u' '.join(text)
386                 # for i in range(len(text)):
387                 #     if i > 0:
388                 #         if text[i][0] not in separator\
389                 #             and text[i - 1][-1] not in separator:
390                 #          text.insert(i, u" ")
391
392             return re.sub("(?m)/$", "", text)
393
394         def add_part(snippets, **fields):
395             doc = self.create_book_doc(book)
396             for f in book_fields:
397                 doc.add(f)
398
399             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
400             doc.add(NumericField("header_span", Field.Store.YES, True)\
401                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
402             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
403
404             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
405                           Field.TermVector.WITH_POSITIONS_OFFSETS))
406
407             snip_pos = snippets.add(fields["content"])
408             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
409             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
410
411             if 'fragment_anchor' in fields:
412                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
413                               Field.Store.YES, Field.Index.NOT_ANALYZED))
414
415             if 'themes' in fields:
416                 themes, themes_pl = zip(*[
417                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
418                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
419                      for theme in fields['themes']])
420
421                 themes = self.add_gaps(themes, 'themes')
422                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
423
424                 for t in themes:
425                     doc.add(t)
426                 for t in themes_pl:
427                     doc.add(t)
428
429             return doc
430
431         def give_me_utf8(s):
432             if isinstance(s, unicode):
433                 return s.encode('utf-8')
434             else:
435                 return s
436
437         fragments = {}
438         snippets = Snippets(book.id).open('w')
439         try:
440             for header, position in zip(list(master), range(len(master))):
441
442                 if header.tag in self.skip_header_tags:
443                     continue
444                 if header.tag is etree.Comment:
445                     continue
446
447                 # section content
448                 content = []
449                 footnote = None
450
451                 for start, end in walker(header, ignore_tags=self.ignore_content_tags):
452                     # handle footnotes
453                     # if start is not None and start.tag in self.footnote_tags:
454                     #     footnote = ' '.join(start.itertext())
455                     # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
456                     #     doc = add_part(snippets, header_index=position, header_type=header.tag,
457                     #                    content=footnote)
458
459                     #     self.index.addDocument(doc)
460
461                     #     footnote = None
462
463                     # handle fragments and themes.
464                     if start is not None and start.tag == 'begin':
465                         fid = start.attrib['id'][1:]
466                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
467
468                     elif start is not None and start.tag == 'motyw':
469                         fid = start.attrib['id'][1:]
470                         if start.text is not None:
471                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
472
473                     elif start is not None and start.tag == 'end':
474                         fid = start.attrib['id'][1:]
475                         if fid not in fragments:
476                             continue  # a broken <end> node, skip it
478                         frag = fragments[fid]
479                         if frag['themes'] == []:
480                             continue  # empty themes list.
481                         del fragments[fid]
482
483                         doc = add_part(snippets,
484                                        header_type=frag['start_header'],
485                                        header_index=frag['start_section'],
486                                        header_span=position - frag['start_section'] + 1,
487                                        fragment_anchor=fid,
488                                        content=fix_format(frag['content']),
489                                        themes=frag['themes'])
490
491                         self.index.addDocument(doc)
492
493                         # Collect content.
494                     elif start is not None:
495                         for frag in fragments.values():
496                             frag['content'].append(start.text)
497                         content.append(start.text)
498                     elif end is not None:
499                         for frag in fragments.values():
500                             frag['content'].append(end.tail)
501                         content.append(end.tail)
502
503                 # in the end, add the section text.
504                 doc = add_part(snippets, header_index=position, header_type=header.tag,
505                                content=fix_format(content))
506
507                 self.index.addDocument(doc)
508
509         finally:
510             snippets.close()
511
512
513 def log_exception_wrapper(f):
514     def _wrap(*a):
515         try:
516             f(*a)
517         except Exception, e:
518             print("Error in indexing thread: %s" % e)
519             traceback.print_exc()
520             raise e
521     return _wrap
522
523
524 class ReusableIndex(Index):
525     """
526     Works like Index, but does not close/optimize the Lucene index
527     until program exit (uses an atexit hook).
528     This is useful for the importbooks command.
529
530     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
531     """
532     index = None
533
534     def open(self, analyzer=None, threads=4):
535         if ReusableIndex.index is not None:
536             self.index = ReusableIndex.index
537         else:
538             print("opening index")
539             Index.open(self, analyzer)
540             ReusableIndex.index = self.index
541             atexit.register(ReusableIndex.close_reusable)
542
543     # def index_book(self, *args, **kw):
544     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
545     #     ReusableIndex.pool_jobs.append(job)
546
547     @staticmethod
548     def close_reusable():
549         if ReusableIndex.index is not None:
550             ReusableIndex.index.optimize()
551             ReusableIndex.index.close()
552             ReusableIndex.index = None
553
554     def close(self):
555         pass
556
557
558 class JoinSearch(object):
559     """
560     This mixin could be used to handle block join queries.
561     (currently unused)
562     """
563     def __init__(self, *args, **kw):
564         super(JoinSearch, self).__init__(*args, **kw)
565
566     def wrapjoins(self, query, fields=[]):
567         """
568         This function modifies the query recursively, so that contained
569         Term and Phrase queries which match the provided fields
570         are wrapped in a BlockJoinQuery and thus delegated
571         to child documents.
572         """
573         if BooleanQuery.instance_(query):
574             qs = BooleanQuery.cast_(query)
575             for clause in qs:
576                 clause = BooleanClause.cast_(clause)
577                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
578             return qs
579         else:
580             termset = HashSet()
581             query.extractTerms(termset)
582             for t in termset:
583                 t = Term.cast_(t)
584                 if t.field() not in fields:
585                     return query
586             return BlockJoinQuery(query, self.parent_filter,
587                                   BlockJoinQuery.ScoreMode.Total)
588
589     def bsearch(self, query, max_results=50):
590         q = self.query(query)
591         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
592
593         tops = self.searcher.search(bjq, max_results)
594         bks = []
595         for found in tops.scoreDocs:
596             doc = self.searcher.doc(found.doc)
597             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
598         return (bks, tops.totalHits)
599
600
601 class SearchResult(object):
602     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
603         if tokens_cache is None: tokens_cache = {}
604
605         if score:
606             self._score = score
607         else:
608             self._score = scoreDocs.score
609
610         self.boost = 1.0
611
612         self._hits = []
613         self._processed_hits = None  # processed hits
614
615         stored = search.searcher.doc(scoreDocs.doc)
616         self.book_id = int(stored.get("book_id"))
617
618         pd = stored.get("published_date")
619         if pd is None:
620             pd = 0
621         self.published_date = int(pd)
622
623         header_type = stored.get("header_type")
624         # we have a content hit in some header or fragment
625         if header_type is not None:
626             sec = (header_type, int(stored.get("header_index")))
627             header_span = stored.get('header_span')
628             header_span = header_span is not None and int(header_span) or 1
629
630             fragment = stored.get("fragment_anchor")
631
632             if snippets:
633                 snippets = snippets.replace("/\n", "\n")
634             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
635
636             self._hits.append(hit)
637
638         self.search = search
639         self.searched = searched
640         self.tokens_cache = tokens_cache
641
642     @property
643     def score(self):
644         return self._score * self.boost
645
646     def merge(self, other):
647         if self.book_id != other.book_id:
648             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
649         self._hits += other._hits
650         if other.score > self.score:
651             self._score = other._score
652         return self
653
654     def get_book(self):
655         return catalogue.models.Book.objects.get(id=self.book_id)
656
657     book = property(get_book)
658
659     @property
660     def hits(self):
661         if self._processed_hits is not None:
662             return self._processed_hits
663
664         POSITION = 0
665         FRAGMENT = 1
666         POSITION_INDEX = 1
667         POSITION_SPAN = 2
668         SCORE = 2
669         OTHER = 3
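        # indexes into the raw hit tuples collected in __init__:
        # ((header_type, header_index, header_span), fragment_anchor, score, {extra})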
670
671         # to sections and fragments
672         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
673         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
674         sect = filter(lambda s: 0 == len(filter(
675             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
676             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
677             frags)), sect)
678
679         hits = []
680
681         # remove duplicate fragments
682         fragments = {}
683         for f in frags:
684             fid = f[FRAGMENT]
685             if fid in fragments:
686                 if fragments[fid][SCORE] >= f[SCORE]:
687                     continue
688             fragments[fid] = f
689         frags = fragments.values()
690
691         # remove duplicate sections
692         sections = {}
693
694         for s in sect:
695             si = s[POSITION][POSITION_INDEX]
696             # skip existing
697             if si in sections:
698                 if sections[si]['score'] >= s[SCORE]:
699                     continue
700
701             m = {'score': s[SCORE],
702                  'section_number': s[POSITION][POSITION_INDEX] + 1,
703                  }
704             m.update(s[OTHER])
705             sections[si] = m
706
707         hits = sections.values()
708
709         for f in frags:
710             try:
711                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
712             except catalogue.models.Fragment.DoesNotExist:
713                 # stale index
714                 continue
715
716             # Figure out if we were searching for a token matching some word in theme name.
717             themes = frag.tags.filter(category='theme')
718             themes_hit = []
719             if self.searched is not None:
720                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
721                 for theme in themes:
722                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
723                     for t in tokens:
724                         if t in name_tokens:
725                             if not theme in themes_hit:
726                                 themes_hit.append(theme)
727                             break
728
729             m = {'score': f[SCORE],
730                  'fragment': frag,
731                  'section_number': f[POSITION][POSITION_INDEX] + 1,
732                  'themes': themes,
733                  'themes_hit': themes_hit
734                  }
735             m.update(f[OTHER])
736             hits.append(m)
737
738         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
739
740         self._processed_hits = hits
741
742         return hits
743
744     def __unicode__(self):
745         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
746
747     @staticmethod
748     def aggregate(*result_lists):
749         books = {}
750         for rl in result_lists:
751             for r in rl:
752                 if r.book_id in books:
753                     books[r.book_id].merge(r)
754                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
755                 else:
756                     books[r.book_id] = r
757         return books.values()
758
759     def __cmp__(self, other):
760         c = cmp(self.score, other.score)
761         if c == 0:
762             # this is inverted, because earlier date is better
763             return cmp(other.published_date, self.published_date)
764         else:
765             return c
766
767
768 class Hint(object):
769     """
770     Given some hint information (things we already know about
771     the search target - like author, title of a specific book, epoch, genre, kind)
772     we can narrow down the search using filters.
773     """
774     def __init__(self, search):
775         """
776         Accepts a Searcher instance.
777         """
778         self.search = search
779         self.book_tags = {}
780         self.part_tags = []
781         self._books = []
782
783     def books(self, *books):
784         """
785         Give a hint that we search these books.
786         """
787         self._books = books
788
789     def tags(self, tags):
790         """
791         Give a hint that these Tag objects (a list)
792         are necessary.
793         """
794         for t in tags:
795             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
796                 lst = self.book_tags.get(t.category, [])
797                 lst.append(t)
798                 self.book_tags[t.category] = lst
799             if t.category in ['theme', 'theme_pl']:
800                 self.part_tags.append(t)
801
802     def tag_filter(self, tags, field='tags'):
803         """
804         Given a list of tags and an optional field (tags are normally in the 'tags' field),
805         return a filter accepting only books with those specific tags.
806         """
807         q = BooleanQuery()
808
809         for tag in tags:
810             toks = self.search.get_tokens(tag.name, field=field)
811             tag_phrase = PhraseQuery()
812             for tok in toks:
813                 tag_phrase.add(Term(field, tok))
814             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
815
816         return QueryWrapperFilter(q)
817
818     def book_filter(self):
819         """
820         Filters using book tags (all tag kinds except a theme)
821         """
822         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
823         if tags:
824             return self.tag_filter(tags)
825         else:
826             return None
827
828     def part_filter(self):
829         """
830         This filter can be used to look for book parts.
831         It filters on book id and/or themes.
832         """
833         fs = []
834         if self.part_tags:
835             fs.append(self.tag_filter(self.part_tags, field='themes'))
836
837         if self._books != []:
838             bf = BooleanFilter()
839             for b in self._books:
840                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
841                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
842             fs.append(bf)
843
844         return Search.chain_filters(fs)
845
846     def should_search_for_book(self):
847         return self._books == []
848
849     def just_search_in(self, all):
850         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
851         some = []
852         for field in all:
853             if field == 'authors' and 'author' in self.book_tags:
854                 continue
855             if field == 'title' and self._books != []:
856                 continue
857             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
858                 continue
859             some.append(field)
860         return some
861
862
863 class Search(IndexStore):
864     """
865     Search facilities.
866     """
867     def __init__(self, default_field="content"):
868         IndexStore.__init__(self)
869         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
870         # self.analyzer = WLAnalyzer()
871         self.searcher = IndexSearcher(self.store, True)
872         self.parser = QueryParser(Version.LUCENE_34, default_field,
873                                   self.analyzer)
874
875         self.parent_filter = TermsFilter()
876         self.parent_filter.addTerm(Term("is_book", "true"))
877
878     def query(self, query):
879         """Parse query in default Lucene Syntax. (for humans)
880         """
881         return self.parser.parse(query)
882
883     def simple_search(self, query, max_results=50):
884         """Runs a query for books using lucene syntax. (for humans)
885         Returns (books, total_hits)
886         """
887
888         tops = self.searcher.search(self.query(query), max_results)
889         bks = []
890         for found in tops.scoreDocs:
891             doc = self.searcher.doc(found.doc)
892             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
893         return (bks, tops.totalHits)
894
895     def get_tokens(self, searched, field='content', cached=None):
896         """returns tokens analyzed by a proper (for a field) analyzer
897         argument can be: StringReader, string/unicode, or tokens. In the last case
898         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
899         """
900         if cached is not None and field in cached:
901             return cached[field]
902
903         if isinstance(searched, str) or isinstance(searched, unicode):
904             searched = StringReader(searched)
905         elif isinstance(searched, list):
906             return searched
907
908         searched.reset()
909         tokens = self.analyzer.reusableTokenStream(field, searched)
910         toks = []
911         while tokens.incrementToken():
912             cta = tokens.getAttribute(CharTermAttribute.class_)
913             toks.append(cta.toString())
914
915         if cached is not None:
916             cached[field] = toks
917
918         return toks
919
920     def fuzziness(self, fuzzy):
921         """Helper method to sanitize fuzziness"""
922         if not fuzzy:
923             return None
924         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
925             return fuzzy
926         else:
927             return 0.5
928
929     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
930         """
931         Return a PhraseQuery with a series of tokens.
932         """
933         if fuzzy:
934             phrase = MultiPhraseQuery()
935             for t in tokens:
936                 term = Term(field, t)
937                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
938                 fuzzterms = []
939
940                 while True:
941                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
942                     ft = fuzzterm.term()
943                     if ft:
944                         fuzzterms.append(ft)
945                     if not fuzzterm.next(): break
946                 if fuzzterms:
947                     phrase.add(JArray('object')(fuzzterms, Term))
948                 else:
949                     phrase.add(term)
950         else:
951             phrase = PhraseQuery()
952             phrase.setSlop(slop)
953             for t in tokens:
954                 term = Term(field, t)
955                 phrase.add(term)
956         return phrase
957
958     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
959         """
960         Returns term queries joined by a boolean query.
961         modal - the Occur clause applied to each term in the boolean query
962         fuzzy - should the query be fuzzy.
963         """
964         q = BooleanQuery()
965         for t in tokens:
966             term = Term(field, t)
967             if fuzzy:
968                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
969             else:
970                 term = TermQuery(term)
971             q.add(BooleanClause(term, modal))
972         return q
973
974     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
975                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
976         if filters is None: filters = []
977         if tokens_cache is None: tokens_cache = {}
978
979         tokens = self.get_tokens(searched, field, cached=tokens_cache)
980
981         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
982         if book:
983             filters.append(self.term_filter(Term('is_book', 'true')))
984         top = self.searcher.search(query, self.chain_filters(filters), max_results)
985
986         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
987
988     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
989                     filters=None, tokens_cache=None, boost=None, snippets=True):
990         if filters is None: filters = []
991         if tokens_cache is None: tokens_cache = {}
992
993         if book:
994             filters.append(self.term_filter(Term('is_book', 'true')))
995
996         query = BooleanQuery()
997
998         for fld in fields:
999             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1000
1001             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1002                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1003
1004         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1005
1006         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1007                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1008
1009     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1010         """
1011         Search for perfect book matches. Just see if the query matches with some author or title,
1012         taking hints into account.
1013         """
1014         fields_to_search = ['authors', 'title']
1015         only_in = None
1016         if hint:
1017             if not hint.should_search_for_book():
1018                 return []
1019             fields_to_search = hint.just_search_in(fields_to_search)
1020             only_in = hint.book_filter()
1021
1022         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1023
1024         books = []
1025         for q in qrys:
1026             top = self.searcher.search(q,
1027                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1028                 max_results)
1029             for found in top.scoreDocs:
1030                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1031         return books
1032
1033     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1034         fields_to_search = ['tags', 'authors', 'title']
1035
1036         only_in = None
1037         if hint:
1038             if not hint.should_search_for_book():
1039                 return []
1040             fields_to_search = hint.just_search_in(fields_to_search)
1041             only_in = hint.book_filter()
1042
1043         tokens = self.get_tokens(searched, field='SIMPLE')
1044
1045         q = BooleanQuery()
1046
1047         for fld in fields_to_search:
1048             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1049                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1050
1051         books = []
1052         top = self.searcher.search(q,
1053                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1054             max_results)
1055         for found in top.scoreDocs:
1056             books.append(SearchResult(self, found, how_found="search_book"))
1057
1058         return books
1059
1060     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1061         """
1062         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1063         some part/fragment of the book.
1064         """
1065         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1066
1067         flt = None
1068         if hint:
1069             flt = hint.part_filter()
1070
1071         books = []
1072         for q in qrys:
1073             top = self.searcher.search(q,
1074                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1075                                                            flt]),
1076                                        max_results)
1077             for found in top.scoreDocs:
1078                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1079
1080         return books
1081
1082     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1083         """
1084         Tries to use the search terms to match different fields of the book (or its parts).
1085         E.g. one word can match an author's surname, another can be part of the title, and the rest
1086         can be some words from the third chapter.
1087         """
1088         if tokens_cache is None: tokens_cache = {}
1089         books = []
1090         only_in = None
1091
1092         if hint:
1093             only_in = hint.part_filter()
1094
1095         # content only query : themes x content
1096         q = BooleanQuery()
1097
1098         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1099         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1100
1101         # only search in themes when we do not already filter by themes
1102         if hint is None or hint.just_search_in(['themes']) != []:
1103             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1104                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1105
1106         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1107                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1108
1109         topDocs = self.searcher.search(q, only_in, max_results)
1110         for found in topDocs.scoreDocs:
1111             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1112             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1113
1114         # query themes/content x author/title/tags
1115         q = BooleanQuery()
1116         in_content = BooleanQuery()
1117         in_meta = BooleanQuery()
1118
1119         for fld in ['themes_pl', 'content']:
1120             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1121
1122         for fld in ['tags', 'authors', 'title']:
1123             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1124
1125         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1126         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1127
1128         topDocs = self.searcher.search(q, only_in, max_results)
1129         for found in topDocs.scoreDocs:
1130             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1131             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1132
1133         return books
1134
1135     # def multisearch(self, query, max_results=50):
1136     #     """
1137     #     Search strategy:
1138     #     - (phrase) OR -> content
1139     #                   -> title
1140     #                   -> authors
1141     #     - (keywords)  -> authors
1142     #                   -> motyw
1143     #                   -> tags
1144     #                   -> content
1145     #     """
1146         # queryreader = StringReader(query)
1147         # tokens = self.get_tokens(queryreader)
1148
1149         # top_level = BooleanQuery()
1150         # Should = BooleanClause.Occur.SHOULD
1151
1152         # phrase_level = BooleanQuery()
1153         # phrase_level.setBoost(1.3)
1154
1155         # p_content = self.make_phrase(tokens, joined=True)
1156         # p_title = self.make_phrase(tokens, 'title')
1157         # p_author = self.make_phrase(tokens, 'author')
1158
1159         # phrase_level.add(BooleanClause(p_content, Should))
1160         # phrase_level.add(BooleanClause(p_title, Should))
1161         # phrase_level.add(BooleanClause(p_author, Should))
1162
1163         # kw_level = BooleanQuery()
1164
1165         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1166         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1167         # kw_level.add(j_themes, Should)
1168         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1169         # j_con = self.make_term_query(tokens, joined=True)
1170         # kw_level.add(j_con, Should)
1171
1172         # top_level.add(BooleanClause(phrase_level, Should))
1173         # top_level.add(BooleanClause(kw_level, Should))
1174
1175         # return None
1176
1177     def get_snippets(self, scoreDoc, query, field='content'):
1178         """
1179         Returns a snippet for found scoreDoc.
1180         """
1181         htmlFormatter = SimpleHTMLFormatter()
1182         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1183
1184         stored = self.searcher.doc(scoreDoc.doc)
1185
1186         position = stored.get('snippets_position')
1187         length = stored.get('snippets_length')
1188         if position is None or length is None:
1189             return None
1190         # locate content.
1191         snippets = Snippets(stored.get('book_id')).open()
1192         try:
1193             text = snippets.get((int(position),
1194                                  int(length)))
1195         finally:
1196             snippets.close()
1197
1198         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1199         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1200         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1201
1202         return snip
1203
1204     @staticmethod
1205     def enum_to_array(enum):
1206         """
1207         Converts a lucene TermEnum to array of Terms, suitable for
1208         addition to queries
1209         """
1210         terms = []
1211
1212         while True:
1213             t = enum.term()
1214             if t:
1215                 terms.append(t)
1216             if not enum.next(): break
1217
1218         if terms:
1219             return JArray('object')(terms, Term)
1220
1221     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1222         """
1223         Search for Tag objects using query.
1224         """
1225         if not pdcounter:
1226             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1227         tops = self.searcher.search(query, filters, max_results)
1228
1229         tags = []
1230         for found in tops.scoreDocs:
1231             doc = self.searcher.doc(found.doc)
1232             is_pdcounter = doc.get('is_pdcounter')
1233             if is_pdcounter:
1234                 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1235             else:
1236                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1237                 # don't add the pdcounter tag if same tag already exists
1238             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1239                 tags.append(tag)
1240                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1241         print 'returning %s' % tags
1242         return tags
1243
1244     def search_books(self, query, filter=None, max_results=10):
1245         """
1246         Searches for Book objects using query
1247         """
1248         bks = []
1249         tops = self.searcher.search(query, filter, max_results)
1250         for found in tops.scoreDocs:
1251             doc = self.searcher.doc(found.doc)
1252             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1253         return bks
1254
1255     def make_prefix_phrase(self, toks, field):
1256         q = MultiPhraseQuery()
1257         for i in range(len(toks)):
1258             t = Term(field, toks[i])
1259             if i == len(toks) - 1:
1260                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1261                 if pterms:
1262                     q.add(pterms)
1263                 else:
1264                     q.add(t)
1265             else:
1266                 q.add(t)
1267         return q
1268
1269     @staticmethod
1270     def term_filter(term, inverse=False):
1271         only_term = TermsFilter()
1272         only_term.addTerm(term)
1273
1274         if inverse:
1275             neg = BooleanFilter()
1276             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1277             only_term = neg
1278
1279         return only_term
1280
1281     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
1282         """
1283         Return auto-complete hints for tags
1284         using prefix search.
1285         """
1286         toks = self.get_tokens(string, field='SIMPLE')
1287         top = BooleanQuery()
1288
1289         for field in ['tag_name', 'tag_name_pl']:
1290             if prefix:
1291                 q = self.make_prefix_phrase(toks, field)
1292             else:
1293                 q = self.make_term_query(toks, field)
1294             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1295
1296         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1297
1298         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1299
1300     def hint_books(self, string, max_results=50, prefix=True):
1301         """
1302         Returns auto-complete hints for book titles
1303         (needed because we do not index 'pseudo' title tags).
1304         Uses prefix search.
1305         """
1306         toks = self.get_tokens(string, field='SIMPLE')
1307
1308         if prefix:
1309             q = self.make_prefix_phrase(toks, 'title')
1310         else:
1311             q = self.make_term_query(toks, 'title')
1312
1313         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1314
1315     @staticmethod
1316     def chain_filters(filters, op=ChainedFilter.AND):
1317         """
1318         Chains a filter list together
1319         """
1320         filters = filter(lambda x: x is not None, filters)
1321         if not filters:
1322             return None
1323         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1324         return chf
1325
1326     def filtered_categories(self, tags):
1327         """
1328         Return a list of tag categories, present in tags list.
1329         """
1330         cats = {}
1331         for t in tags:
1332             cats[t.category] = True
1333         return cats.keys()
1334
1335     def hint(self):
1336         return Hint(self)