apps/search/index.py (wolnelektury.git)
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
33 import atexit
34 import traceback
35
36
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
38     def __init__(self):
39         polish = PolishAnalyzer(Version.LUCENE_34)
40         #        polish_gap.setPositionIncrementGap(999)
41
42         simple = SimpleAnalyzer(Version.LUCENE_34)
43         #        simple_gap.setPositionIncrementGap(999)
44
45         keyword = KeywordAnalyzer(Version.LUCENE_34)
46
47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
48
49         PerFieldAnalyzerWrapper.__init__(self, polish)
50
51         self.addAnalyzer("tags", simple)
52         self.addAnalyzer("technical_editors", simple)
53         self.addAnalyzer("editors", simple)
54         self.addAnalyzer("url", keyword)
55         self.addAnalyzer("source_url", keyword)
56         self.addAnalyzer("source_name", simple)
57         self.addAnalyzer("publisher", simple)
58         self.addAnalyzer("authors", simple)
59         self.addAnalyzer("title", simple)
60
61         self.addAnalyzer("is_book", keyword)
62         # shouldn't the title have two forms? _pl and simple?
63
64         self.addAnalyzer("themes", simple)
65         self.addAnalyzer("themes_pl", polish)
66
67         self.addAnalyzer("tag_name", simple)
68         self.addAnalyzer("tag_name_pl", polish)
69
70         self.addAnalyzer("translators", simple)
71
72         self.addAnalyzer("KEYWORD", keyword)
73         self.addAnalyzer("SIMPLE", simple)
74         self.addAnalyzer("POLISH", polish)
75
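# Illustrative note (not executed): the wrapper above dispatches by field
# name, so e.g. a value indexed under "tags" is tokenized by SimpleAnalyzer,
# "url" is kept as a single token by KeywordAnalyzer, and any field not
# registered falls back to the default PolishAnalyzer.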
76
77 class IndexStore(object):
78     """
79     Provides access to search index.
80
81     self.store - lucene index directory
82     """
83     def __init__(self):
84         self.make_index_dir()
85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
86
87     def make_index_dir(self):
88         try:
89             os.makedirs(settings.SEARCH_INDEX)
90         except OSError as exc:
91             if exc.errno == errno.EEXIST:
92                 pass
93             else: raise
94
95
96 class IndexChecker(IndexStore):
97     def __init__(self):
98         IndexStore.__init__(self)
99
100     def check(self):
101         checker = CheckIndex(self.store)
102         status = checker.checkIndex()
103         return status
104
105
106 class Snippets(object):
107     """
108     This class manages snippet files for an indexed object (a book).
109     The snippets are concatenated together; their positions and
110     lengths are kept in Lucene index fields.
111     """
112     SNIPPET_DIR = "snippets"
113
114     def __init__(self, book_id):
115         try:
116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117         except OSError as exc:
118             if exc.errno == errno.EEXIST:
119                 pass
120             else: raise
121         self.book_id = book_id
122         self.file = None
123
124     def open(self, mode='r'):
125         """
126         Open the snippet file. Call .close() afterwards.
127         """
128         if 'b' not in mode:
129             mode += 'b'
130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
131         self.position = 0
132         return self
133
134     def add(self, snippet):
135         """
136         Append a snippet (unicode) to the snippet file.
137         Return a (position, length) tuple
138         """
139         txt = snippet.encode('utf-8')
140         l = len(txt)
141         self.file.write(txt)
142         pos = (self.position, l)
143         self.position += l
144         return pos
145
146     def get(self, pos):
147         """
148         Given a (position, length) tuple, return the snippet
149         stored there as unicode.
150         """
151         self.file.seek(pos[0], 0)
152         txt = self.file.read(pos[1]).decode('utf-8')
153         return txt
154
155     def close(self):
156         """Close snippet file"""
157         self.file.close()
158
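# Hedged usage sketch for Snippets (book id and text are hypothetical,
# shown as comments only so nothing runs at import time):
#
#     snips = Snippets(1).open('w')
#     try:
#         pos = snips.add(u"Some fragment text.")   # -> (offset, length)
#     finally:
#         snips.close()
#     # later, given the stored (offset, length) pair:
#     #     text = Snippets(1).open().get(pos)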
159
160 class BaseIndex(IndexStore):
161     """
162     Base index class.
163     Provides basic operations on index: opening, closing, optimizing.
164     """
165     def __init__(self, analyzer=None):
166         super(BaseIndex, self).__init__()
167         self.index = None
168         if not analyzer:
169             analyzer = WLAnalyzer()
170         self.analyzer = analyzer
171
172     def open(self, analyzer=None):
173         if self.index:
174             raise Exception("Index is already opened")
175         self.index = IndexWriter(self.store, self.analyzer,\
176                                  IndexWriter.MaxFieldLength.LIMITED)
177         return self.index
178
179     def optimize(self):
180         self.index.optimize()
181
182     def close(self):
183         try:
184             self.index.optimize()
185         except JavaError, je:
186             print "Error during optimize phase, check index: %s" % je
187
188         self.index.close()
189         self.index = None
190
191     def __enter__(self):
192         self.open()
193         return self
194
195     def __exit__(self, type, value, tb):
196         self.close()
197
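# Minimal context-manager sketch (assumes settings.SEARCH_INDEX is
# configured; comment only, not executed):
#
#     with Index() as index:
#         index.index_tags()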
198
199 class Index(BaseIndex):
200     """
201     Class indexing books.
202     """
203     def __init__(self, analyzer=None):
204         super(Index, self).__init__(analyzer)
205
206     def index_tags(self):
207         """
208         Re-index global tag list.
209         Removes all tags from the index, then indexes them again.
210         Indexed fields include: id, name (with and without Polish stems), category.
211         """
212         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
213         self.index.deleteDocuments(q)
214
215         for tag in catalogue.models.Tag.objects.all():
216             doc = Document()
217             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
218             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
220             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
221             self.index.addDocument(doc)
222
223         for pdtag in PDCounterAuthor.objects.all():
224             doc = Document()
225             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
226             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
227             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
228             doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
229             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
230             self.index.addDocument(doc)
231
232     def create_book_doc(self, book):
233         """
234         Create a Lucene document referring to the book id.
235         """
236         doc = Document()
237         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
238         if book.parent is not None:
239             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
240         return doc
241
242     def remove_book(self, book):
243         """Removes a book from search index.
244         book - Book instance."""
245         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
246         self.index.deleteDocuments(q)
247
248     def index_book(self, book, book_info=None, overwrite=True):
249         """
250         Indexes the book.
251         Creates a lucene document for extracted metadata
252         and calls self.index_content() to index the contents of the book.
253         """
254         if overwrite:
255             self.remove_book(book)
256
257         book_doc = self.create_book_doc(book)
258         meta_fields = self.extract_metadata(book, book_info)
259         for f in meta_fields.values():
260             if isinstance(f, list) or isinstance(f, tuple):
261                 for elem in f:
262                     book_doc.add(elem)
263             else:
264                 book_doc.add(f)
265
266         self.index.addDocument(book_doc)
267         del book_doc
268
269         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
270
271     master_tags = [
272         'opowiadanie',
273         'powiesc',
274         'dramat_wierszowany_l',
275         'dramat_wierszowany_lp',
276         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
277         'wywiad',
278         ]
279
280     ignore_content_tags = [
281         'uwaga', 'extra',
282         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
283         'didaskalia',
284         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
285         ]
286
287     footnote_tags = ['pa', 'pt', 'pr', 'pe']
288
289     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
290
291     published_date_re = re.compile("([0-9]+)[\]. ]*$")
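    # Illustrative only: for a source_name ending in "..., 1884." or
    # "... [1884]" the regex above captures "1884"; see extract_metadata below.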
292
293     def extract_metadata(self, book, book_info=None):
294         """
295         Extract metadata from the book and return a map of fields keyed by field name.
296         """
297         fields = {}
298
299         if book_info is None:
300             book_info = dcparser.parse(open(book.xml_file.path))
301
302         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
303         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
304         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
305
306         # validator, name
307         for field in dcparser.BookInfo.FIELDS:
308             if hasattr(book_info, field.name):
309                 if not getattr(book_info, field.name):
310                     continue
311                 # since no type information is available, we use validator
312                 type_indicator = field.validator
313                 if type_indicator == dcparser.as_unicode:
314                     s = getattr(book_info, field.name)
315                     if field.multiple:
316                         s = ', '.join(s)
317                     try:
318                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
319                     except JavaError as je:
320                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
321                 elif type_indicator == dcparser.as_person:
322                     p = getattr(book_info, field.name)
323                     if isinstance(p, dcparser.Person):
324                         persons = unicode(p)
325                     else:
326                         persons = ', '.join(map(unicode, p))
327                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
328                 elif type_indicator == dcparser.as_date:
329                     dt = getattr(book_info, field.name)
330                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
331                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
332
333         # get published date
334         pd = None
335         if hasattr(book_info, 'source_name') and book_info.source_name:
336             match = self.published_date_re.search(book_info.source_name)
337             if match is not None:
338                 pd = str(match.groups()[0])
339         if not pd: pd = ""
340         fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
341
342         return fields
343
344     def add_gaps(self, fields, fieldname):
345         """
346         Interposes a list of fields with gap fields (indexed single spaces) and returns the result.
347         This keeps phrase queries (with slop 0) from matching across the gaps.
348         """
349         def gap():
350             while True:
351                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
352         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
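    # Sketch: add_gaps([f1, f2, f3], 'tags') -> [f1, gap, f2, gap, f3], where
    # each gap is a single-space NOT_ANALYZED field of the same name, so a
    # slop=0 phrase query cannot match across two adjacent tag values.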
353
354     def get_master(self, root):
355         """
356         Returns the first master tag from an etree.
357         """
358         for master in root.iter():
359             if master.tag in self.master_tags:
360                 return master
361
362     def index_content(self, book, book_fields=[]):
363         """
364         Walks the book XML and extracts content from it.
365         Adds parts for each header tag and for each fragment.
366         """
367         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
368         root = wld.edoc.getroot()
369
370         master = self.get_master(root)
371         if master is None:
372             return []
373
374         def walker(node, ignore_tags=[]):
375
376             if node.tag not in ignore_tags:
377                 yield node, None, None
378                 if node.text is not None:
379                     yield None, node.text, None
380                 for child in list(node):
381                     for b, t, e in walker(child):
382                         yield b, t, e
383                 yield None, None, node
384
385             if node.tail is not None:
386                 yield None, node.tail, None
387             return
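        # Event-stream sketch (hypothetical XML <a>x<b>y</b>z</a>): walker(a)
        # yields (a, None, None), (None, 'x', None), (b, None, None),
        # (None, 'y', None), (None, None, b), (None, 'z', None),
        # (None, None, a).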
388
389         def fix_format(text):
390             #            separator = [u" ", u"\t", u".", u";", u","]
391             if isinstance(text, list):
392                 # need to join it first
393                 text = filter(lambda s: s is not None, text)
394                 text = u' '.join(text)
395                 # for i in range(len(text)):
396                 #     if i > 0:
397                 #         if text[i][0] not in separator\
398                 #             and text[i - 1][-1] not in separator:
399                 #          text.insert(i, u" ")
400
401             return re.sub("(?m)/$", "", text)
402
403         def add_part(snippets, **fields):
404             doc = self.create_book_doc(book)
405             for f in book_fields:
406                 doc.add(f)
407
408             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
409             doc.add(NumericField("header_span", Field.Store.YES, True)\
410                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
411             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
412
413             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
414                           Field.TermVector.WITH_POSITIONS_OFFSETS))
415
416             snip_pos = snippets.add(fields["content"])
417             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
418             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
419
420             if 'fragment_anchor' in fields:
421                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
422                               Field.Store.YES, Field.Index.NOT_ANALYZED))
423
424             if 'themes' in fields:
425                 themes, themes_pl = zip(*[
426                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
427                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
428                      for theme in fields['themes']])
429
430                 themes = self.add_gaps(themes, 'themes')
431                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
432
433                 for t in themes:
434                     doc.add(t)
435                 for t in themes_pl:
436                     doc.add(t)
437
438             return doc
439
440         def give_me_utf8(s):
441             if isinstance(s, unicode):
442                 return s.encode('utf-8')
443             else:
444                 return s
445
446         fragments = {}
447         snippets = Snippets(book.id).open('w')
448         try:
449             for header, position in zip(list(master), range(len(master))):
450
451                 if header.tag in self.skip_header_tags:
452                     continue
453                 if header.tag is etree.Comment:
454                     continue
455
456                 # section content
457                 content = []
458                 footnote = []
459
460                 def all_content(text):
461                     for frag in fragments.values():
462                         frag['content'].append(text)
463                     content.append(text)
464                 handle_text = [all_content]
465
466
467                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
468                     # handle footnotes
469                     if start is not None and start.tag in self.footnote_tags:
470                         footnote = []
471                         def collect_footnote(t):
472                             footnote.append(t)
473                         handle_text.append(collect_footnote)
474                     elif end is not None and end.tag in self.footnote_tags:
475                         handle_text.pop()
476                         doc = add_part(snippets, header_index=position, header_type=header.tag,
477                                        content=u''.join(footnote),
478                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
479                 
480                         self.index.addDocument(doc)
481                         #print "@ footnote text: %s" % footnote
482                         footnote = []
483                     
484                     # handle fragments and themes.
485                     if start is not None and start.tag == 'begin':
486                         fid = start.attrib['id'][1:]
487                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
488
489                     # themes for this fragment
490                     elif start is not None and start.tag == 'motyw':
491                         fid = start.attrib['id'][1:]
492                         handle_text.append(None)
493                         if start.text is not None:
494                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
495                     elif end is not None and end.tag == 'motyw':
496                         handle_text.pop()
497
498                     elif start is not None and start.tag == 'end':
499                         fid = start.attrib['id'][1:]
500                         if fid not in fragments:
501                             continue  # a broken <end> node, skip it
502                         frag = fragments[fid]
503                         if frag['themes'] == []:
504                             continue  # empty themes list.
505                         del fragments[fid]
506
507                         doc = add_part(snippets,
508                                        header_type=frag['start_header'],
509                                        header_index=frag['start_section'],
510                                        header_span=position - frag['start_section'] + 1,
511                                        fragment_anchor=fid,
512                                        content=fix_format(frag['content']),
513                                        themes=frag['themes'])
514                         #print '@ FRAG %s' % frag['content']
515                         self.index.addDocument(doc)
516
517                         # Collect content.
518
519                     if text is not None and handle_text:
520                         hdl = handle_text[-1]
521                         if hdl is not None:
522                             hdl(text)
523
524                         # in the end, add a section text.
525                 doc = add_part(snippets, header_index=position, header_type=header.tag,
526                                content=fix_format(content))
527                 #print '@ CONTENT: %s' % fix_format(content)
528
529                 self.index.addDocument(doc)
530
531         finally:
532             snippets.close()
533
534
535 def log_exception_wrapper(f):
536     def _wrap(*a):
537         try:
538             f(*a)
539         except Exception, e:
540             print("Error in indexing thread: %s" % e)
541             traceback.print_exc()
542             raise e
543     return _wrap
544
545
546 class ReusableIndex(Index):
547     """
548     Works like Index, but does not close/optimize the Lucene index
549     until program exit (uses an atexit hook).
550     This is useful for the importbooks command.
551
552     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
553     """
554     index = None
555
556     def open(self, analyzer=None, threads=4):
557         if ReusableIndex.index:
558             self.index = ReusableIndex.index
559         else:
560             print("opening index")
561             Index.open(self, analyzer)
562             ReusableIndex.index = self.index
563             atexit.register(ReusableIndex.close_reusable)
564
565     # def index_book(self, *args, **kw):
566     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
567     #     ReusableIndex.pool_jobs.append(job)
568
569     @staticmethod
570     def close_reusable():
571         if ReusableIndex.index:
572             print("closing index")
573             ReusableIndex.index.optimize()
574             ReusableIndex.index.close()
575             ReusableIndex.index = None
576
577     def close(self):
578         if ReusableIndex.index:
579             ReusableIndex.index.commit()
580
581
582 class JoinSearch(object):
583     """
584     This mixin could be used to handle block join queries.
585     (currently unused)
586     """
587     def __init__(self, *args, **kw):
588         super(JoinSearch, self).__init__(*args, **kw)
589
590     def wrapjoins(self, query, fields=[]):
591         """
592         This function modifies the query recursively, so that
593         contained Term and Phrase queries which match the
594         provided fields are wrapped in a BlockJoinQuery
595         and thus delegated to child documents.
596         """
597         if BooleanQuery.instance_(query):
598             qs = BooleanQuery.cast_(query)
599             for clause in qs:
600                 clause = BooleanClause.cast_(clause)
601                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
602             return qs
603         else:
604             termset = HashSet()
605             query.extractTerms(termset)
606             for t in termset:
607                 t = Term.cast_(t)
608                 if t.field() not in fields:
609                     return query
610             return BlockJoinQuery(query, self.parent_filter,
611                                   BlockJoinQuery.ScoreMode.Total)
612
613     def bsearch(self, query, max_results=50):
614         q = self.query(query)
615         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
616
617         tops = self.searcher.search(bjq, max_results)
618         bks = []
619         for found in tops.scoreDocs:
620             doc = self.searcher.doc(found.doc)
621             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
622         return (bks, tops.totalHits)
623
624
625 class SearchResult(object):
626     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
627         if tokens_cache is None: tokens_cache = {}
628
629         if score:
630             self._score = score
631         else:
632             self._score = scoreDocs.score
633
634         self.boost = 1.0
635
636         self._hits = []
637         self._processed_hits = None  # processed hits
638
639         stored = search.searcher.doc(scoreDocs.doc)
640         self.book_id = int(stored.get("book_id"))
641
642         pd = stored.get("published_date")
643         if pd is None:
644             pd = 0
645         self.published_date = int(pd)
646
647         header_type = stored.get("header_type")
648         # we have a content hit in some header or fragment
649         if header_type is not None:
650             sec = (header_type, int(stored.get("header_index")))
651             header_span = stored.get('header_span')
652             header_span = header_span is not None and int(header_span) or 1
653
654             fragment = stored.get("fragment_anchor")
655
656             if snippets:
657                 snippets = snippets.replace("/\n", "\n")
658             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
659
660             self._hits.append(hit)
661
662         self.search = search
663         self.searched = searched
664         self.tokens_cache = tokens_cache
665
666     @property
667     def score(self):
668         return self._score * self.boost
669
670     def merge(self, other):
671         if self.book_id != other.book_id:
672             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
673         self._hits += other._hits
674         if other.score > self.score:
675             self._score = other._score
676         return self
677
678     def get_book(self):
679         return catalogue.models.Book.objects.get(id=self.book_id)
680
681     book = property(get_book)
682
683     @property
684     def hits(self):
685         if self._processed_hits is not None:
686             return self._processed_hits
687
688         POSITION = 0
689         FRAGMENT = 1
690         POSITION_INDEX = 1
691         POSITION_SPAN = 2
692         SCORE = 2
693         OTHER = 3
694
695         # to sections and fragments
696         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
697         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
698         sect = filter(lambda s: 0 == len(filter(
699             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
700             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
701             frags)), sect)
702
703         hits = []
704
705         # remove duplicate fragments
706         fragments = {}
707         for f in frags:
708             fid = f[FRAGMENT]
709             if fid in fragments:
710                 if fragments[fid][SCORE] >= f[SCORE]:
711                     continue
712             fragments[fid] = f
713         frags = fragments.values()
714
715         # remove duplicate sections
716         sections = {}
717
718         for s in sect:
719             si = s[POSITION][POSITION_INDEX]
720             # skip existing
721             if si in sections:
722                 if sections[si]['score'] >= s[SCORE]:
723                     continue
724
725             m = {'score': s[SCORE],
726                  'section_number': s[POSITION][POSITION_INDEX] + 1,
727                  }
728             m.update(s[OTHER])
729             sections[si] = m
730
731         hits = sections.values()
732
733         for f in frags:
734             try:
735                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
736             except catalogue.models.Fragment.DoesNotExist:
737                 # stale index
738                 continue
739
740             # Figure out if we were searching for a token matching some word in theme name.
741             themes = frag.tags.filter(category='theme')
742             themes_hit = []
743             if self.searched is not None:
744                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
745                 for theme in themes:
746                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
747                     for t in tokens:
748                         if t in name_tokens:
749                             if not theme in themes_hit:
750                                 themes_hit.append(theme)
751                             break
752
753             m = {'score': f[SCORE],
754                  'fragment': frag,
755                  'section_number': f[POSITION][POSITION_INDEX] + 1,
756                  'themes': themes,
757                  'themes_hit': themes_hit
758                  }
759             m.update(f[OTHER])
760             hits.append(m)
761
762         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
763
764         self._processed_hits = hits
765
766         return hits
767
768     def __unicode__(self):
769         return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)
770
771     @staticmethod
772     def aggregate(*result_lists):
773         books = {}
774         for rl in result_lists:
775             for r in rl:
776                 if r.book_id in books:
777                     books[r.book_id].merge(r)
778                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
779                 else:
780                     books[r.book_id] = r
781         return books.values()
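    # Note: aggregate() collapses results from several query strategies into
    # one SearchResult per book, concatenating hit lists via merge() and
    # keeping the better score.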
782
783     def __cmp__(self, other):
784         c = cmp(self.score, other.score)
785         if c == 0:
786             # this is inverted, because earlier date is better
787             return cmp(other.published_date, self.published_date)
788         else:
789             return c
790
791
792 class Hint(object):
793     """
794     Given some hint information (things we already know about
795     our search target - like author, title (a specific book), epoch, genre, kind)
796     we can narrow down the search using filters.
797     """
798     def __init__(self, search):
799         """
800         Accepts a Searcher instance.
801         """
802         self.search = search
803         self.book_tags = {}
804         self.part_tags = []
805         self._books = []
806
807     def books(self, *books):
808         """
809         Give a hint that we search these books.
810         """
811         self._books = books
812
813     def tags(self, tags):
814         """
815         Give a hint that these Tag objects (a list)
816         are necessary.
817         """
818         for t in tags:
819             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
820                 lst = self.book_tags.get(t.category, [])
821                 lst.append(t)
822                 self.book_tags[t.category] = lst
823             if t.category in ['theme', 'theme_pl']:
824                 self.part_tags.append(t)
825
826     def tag_filter(self, tags, field='tags'):
827         """
828         Given a list of tags and an optional field (they are normally in the 'tags' field),
829         returns a filter accepting only books with the specified tags.
830         """
831         q = BooleanQuery()
832
833         for tag in tags:
834             toks = self.search.get_tokens(tag.name, field=field)
835             tag_phrase = PhraseQuery()
836             for tok in toks:
837                 tag_phrase.add(Term(field, tok))
838             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
839
840         return QueryWrapperFilter(q)
841
842     def book_filter(self):
843         """
844         Filters using book tags (all tag kinds except a theme)
845         """
846         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
847         if tags:
848             return self.tag_filter(tags)
849         else:
850             return None
851
852     def part_filter(self):
853         """
854         This filter can be used to look for book parts.
855         It filters on book id and/or themes.
856         """
857         fs = []
858         if self.part_tags:
859             fs.append(self.tag_filter(self.part_tags, field='themes'))
860
861         if self._books != []:
862             bf = BooleanFilter()
863             for b in self._books:
864                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
865                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
866             fs.append(bf)
867
868         return Search.chain_filters(fs)
869
870     def should_search_for_book(self):
871         return self._books == []
872
873     def just_search_in(self, all):
874         """Holds logic to figure out which indexes should be searched, when we already have some hints."""
875         some = []
876         for field in all:
877             if field == 'authors' and 'author' in self.book_tags:
878                 continue
879             if field == 'title' and self._books != []:
880                 continue
881             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
882                 continue
883             some.append(field)
884         return some
885
886
887 class Search(IndexStore):
888     """
889     Search facilities.
890     """
891     def __init__(self, default_field="content"):
892         IndexStore.__init__(self)
893         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
894         # self.analyzer = WLAnalyzer()
895         self.searcher = IndexSearcher(self.store, True)
896         self.parser = QueryParser(Version.LUCENE_34, default_field,
897                                   self.analyzer)
898
899         self.parent_filter = TermsFilter()
900         self.parent_filter.addTerm(Term("is_book", "true"))
901
902     def query(self, query):
903         """Parse query in default Lucene Syntax. (for humans)
904         """
905         return self.parser.parse(query)
906
907     def simple_search(self, query, max_results=50):
908         """Runs a query for books using lucene syntax. (for humans)
909         Returns (books, total_hits)
910         """
911
912         tops = self.searcher.search(self.query(query), max_results)
913         bks = []
914         for found in tops.scoreDocs:
915             doc = self.searcher.doc(found.doc)
916             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
917         return (bks, tops.totalHits)
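    # Hedged example (the query string is hypothetical; Lucene syntax):
    #
    #     search = Search()
    #     books, total = search.simple_search(u'authors: mickiewicz')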
918
919     def get_tokens(self, searched, field='content', cached=None):
920         """Returns tokens analyzed by the proper analyzer for the given field.
921         The argument can be a StringReader, a string/unicode, or a token list. In the last case
922         the tokens are just returned (so we can reuse tokens, as long as the analyzer does not change).
923         """
924         if cached is not None and field in cached:
925             return cached[field]
926
927         if isinstance(searched, str) or isinstance(searched, unicode):
928             searched = StringReader(searched)
929         elif isinstance(searched, list):
930             return searched
931
932         searched.reset()
933         tokens = self.analyzer.reusableTokenStream(field, searched)
934         toks = []
935         while tokens.incrementToken():
936             cta = tokens.getAttribute(CharTermAttribute.class_)
937             toks.append(cta.toString())
938
939         if cached is not None:
940             cached[field] = toks
941
942         return toks
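    # Sketch: get_tokens(u'Ala ma kota', field='SIMPLE') should yield
    # [u'ala', u'ma', u'kota'] (lower-cased by SimpleAnalyzer); passing that
    # list back in returns it unchanged, which is what tokens_cache relies on.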
943
944     def fuzziness(self, fuzzy):
945         """Helper method to sanitize fuzziness"""
946         if not fuzzy:
947             return None
948         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
949             return fuzzy
950         else:
951             return 0.5
952
953     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
954         """
955         Return a PhraseQuery with a series of tokens.
956         """
957         if fuzzy:
958             phrase = MultiPhraseQuery()
959             for t in tokens:
960                 term = Term(field, t)
961                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
962                 fuzzterms = []
963
964                 while True:
965                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
966                     ft = fuzzterm.term()
967                     if ft:
968                         fuzzterms.append(ft)
969                     if not fuzzterm.next(): break
970                 if fuzzterms:
971                     phrase.add(JArray('object')(fuzzterms, Term))
972                 else:
973                     phrase.add(term)
974         else:
975             phrase = PhraseQuery()
976             phrase.setSlop(slop)
977             for t in tokens:
978                 term = Term(field, t)
979                 phrase.add(term)
980         return phrase
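    # Sketch: make_phrase([u'ala', u'ma', u'kota'], field='content', slop=2)
    # builds a PhraseQuery requiring the tokens in order, allowing up to two
    # extra positions between them; with fuzzy set, each position is expanded
    # to all index terms within the requested fuzziness.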
981
982     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
983         """
984         Returns term queries joined by a boolean query.
985         modal - applies to the boolean query
986         fuzzy - should the query be fuzzy.
987         """
988         q = BooleanQuery()
989         for t in tokens:
990             term = Term(field, t)
991             if fuzzy:
992                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
993             else:
994                 term = TermQuery(term)
995             q.add(BooleanClause(term, modal))
996         return q
997
998     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
999                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1000         if filters is None: filters = []
1001         if tokens_cache is None: tokens_cache = {}
1002
1003         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1004
1005         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1006         if book:
1007             filters.append(self.term_filter(Term('is_book', 'true')))
1008         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1009
1010         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1011
1012     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1013                     filters=None, tokens_cache=None, boost=None, snippets=True):
1014         if filters is None: filters = []
1015         if tokens_cache is None: tokens_cache = {}
1016
1017         if book:
1018             filters.append(self.term_filter(Term('is_book', 'true')))
1019
1020         query = BooleanQuery()
1021
1022         for fld in fields:
1023             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1024
1025             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1026                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1027
1028         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1029
1030         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1031                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1032
1033     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1034         """
1035         Search for perfect book matches. Just see if the query matches with some author or title,
1036         taking hints into account.
1037         """
1038         fields_to_search = ['authors', 'title']
1039         only_in = None
1040         if hint:
1041             if not hint.should_search_for_book():
1042                 return []
1043             fields_to_search = hint.just_search_in(fields_to_search)
1044             only_in = hint.book_filter()
1045
1046         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1047
1048         books = []
1049         for q in qrys:
1050             top = self.searcher.search(q,
1051                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1052                 max_results)
1053             for found in top.scoreDocs:
1054                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1055         return books
1056
1057     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1058         fields_to_search = ['tags', 'authors', 'title']
1059
1060         only_in = None
1061         if hint:
1062             if not hint.should_search_for_book():
1063                 return []
1064             fields_to_search = hint.just_search_in(fields_to_search)
1065             only_in = hint.book_filter()
1066
1067         tokens = self.get_tokens(searched, field='SIMPLE')
1068
1069         q = BooleanQuery()
1070
1071         for fld in fields_to_search:
1072             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1073                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1074
1075         books = []
1076         top = self.searcher.search(q,
1077                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1078             max_results)
1079         for found in top.scoreDocs:
1080             books.append(SearchResult(self, found, how_found="search_book"))
1081
1082         return books
1083
1084     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1085         """
1086         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1087         some part/fragment of the book.
1088         """
1089         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1090
1091         flt = None
1092         if hint:
1093             flt = hint.part_filter()
1094
1095         books = []
1096         for q in qrys:
1097             top = self.searcher.search(q,
1098                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1099                                                            flt]),
1100                                        max_results)
1101             for found in top.scoreDocs:
1102                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1103
1104         return books
1105
1106     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1107         """
1108         Tries to use search terms to match different fields of the book (or its parts).
1109         E.g. one word can be an author surname, another a part of the title, and the rest
1110         some words from the third chapter.
1111         """
1112         if tokens_cache is None: tokens_cache = {}
1113         books = []
1114         only_in = None
1115
1116         if hint:
1117             only_in = hint.part_filter()
1118
1119         # content only query : themes x content
1120         q = BooleanQuery()
1121
1122         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1123         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1124
1125         # only search in themes when we do not already filter by themes
1126         if hint is None or hint.just_search_in(['themes']) != []:
1127             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1128                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1129
1130         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1131                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1132
1133         topDocs = self.searcher.search(q, only_in, max_results)
1134         for found in topDocs.scoreDocs:
1135             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1136             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1137
1138         # query themes/content x author/title/tags
1139         q = BooleanQuery()
1140         in_content = BooleanQuery()
1141         in_meta = BooleanQuery()
1142
1143         for fld in ['themes_pl', 'content']:
1144             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1145
1146         for fld in ['tags', 'authors', 'title']:
1147             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1148
1149         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1150         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1151
1152         topDocs = self.searcher.search(q, only_in, max_results)
1153         for found in topDocs.scoreDocs:
1154             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1155             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1156
1157         return books
1158
1159     # def multisearch(self, query, max_results=50):
1160     #     """
1161     #     Search strategy:
1162     #     - (phrase) OR -> content
1163     #                   -> title
1164     #                   -> authors
1165     #     - (keywords)  -> authors
1166     #                   -> motyw
1167     #                   -> tags
1168     #                   -> content
1169     #     """
1170         # queryreader = StringReader(query)
1171         # tokens = self.get_tokens(queryreader)
1172
1173         # top_level = BooleanQuery()
1174         # Should = BooleanClause.Occur.SHOULD
1175
1176         # phrase_level = BooleanQuery()
1177         # phrase_level.setBoost(1.3)
1178
1179         # p_content = self.make_phrase(tokens, joined=True)
1180         # p_title = self.make_phrase(tokens, 'title')
1181         # p_author = self.make_phrase(tokens, 'author')
1182
1183         # phrase_level.add(BooleanClause(p_content, Should))
1184         # phrase_level.add(BooleanClause(p_title, Should))
1185         # phrase_level.add(BooleanClause(p_author, Should))
1186
1187         # kw_level = BooleanQuery()
1188
1189         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1190         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1191         # kw_level.add(j_themes, Should)
1192         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1193         # j_con = self.make_term_query(tokens, joined=True)
1194         # kw_level.add(j_con, Should)
1195
1196         # top_level.add(BooleanClause(phrase_level, Should))
1197         # top_level.add(BooleanClause(kw_level, Should))
1198
1199         # return None
1200
1201     def get_snippets(self, scoreDoc, query, field='content'):
1202         """
1203         Returns a snippet for found scoreDoc.
1204         """
1205         htmlFormatter = SimpleHTMLFormatter()
1206         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1207
1208         stored = self.searcher.doc(scoreDoc.doc)
1209
1210         position = stored.get('snippets_position')
1211         length = stored.get('snippets_length')
1212         if position is None or length is None:
1213             return None
1214         # locate content.
1215         snippets = Snippets(stored.get('book_id')).open()
1216         try:
1217             text = snippets.get((int(position),
1218                                  int(length)))
1219         finally:
1220             snippets.close()
1221
1222         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1223         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1224         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1225
1226         return snip
1227
1228     @staticmethod
1229     def enum_to_array(enum):
1230         """
1231         Converts a lucene TermEnum to array of Terms, suitable for
1232         addition to queries
1233         """
1234         terms = []
1235
1236         while True:
1237             t = enum.term()
1238             if t:
1239                 terms.append(t)
1240             if not enum.next(): break
1241
1242         if terms:
1243             return JArray('object')(terms, Term)
1244
1245     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1246         """
1247         Search for Tag objects using query.
1248         """
1249         if not pdcounter:
1250             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1251         tops = self.searcher.search(query, filters, max_results)
1252
1253         tags = []
1254         for found in tops.scoreDocs:
1255             doc = self.searcher.doc(found.doc)
1256             is_pdcounter = doc.get('is_pdcounter')
1257             if is_pdcounter:
1258                 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1259             else:
1260                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1261                 # don't add the pdcounter tag if same tag already exists
1262             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1263                 tags.append(tag)
1264                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1265         print 'returning %s' % tags
1266         return tags
1267
1268     def search_books(self, query, filter=None, max_results=10):
1269         """
1270         Searches for Book objects using query
1271         """
1272         bks = []
1273         tops = self.searcher.search(query, filter, max_results)
1274         for found in tops.scoreDocs:
1275             doc = self.searcher.doc(found.doc)
1276             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1277         return bks
1278
1279     def make_prefix_phrase(self, toks, field):
1280         q = MultiPhraseQuery()
1281         for i in range(len(toks)):
1282             t = Term(field, toks[i])
1283             if i == len(toks) - 1:
1284                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1285                 if pterms:
1286                     q.add(pterms)
1287                 else:
1288                     q.add(t)
1289             else:
1290                 q.add(t)
1291         return q
1292
1293     @staticmethod
1294     def term_filter(term, inverse=False):
1295         only_term = TermsFilter()
1296         only_term.addTerm(term)
1297
1298         if inverse:
1299             neg = BooleanFilter()
1300             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1301             only_term = neg
1302
1303         return only_term
1304
1305     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
1306         """
1307         Return auto-complete hints for tags
1308         using prefix search.
1309         """
1310         toks = self.get_tokens(string, field='SIMPLE')
1311         top = BooleanQuery()
1312
1313         for field in ['tag_name', 'tag_name_pl']:
1314             if prefix:
1315                 q = self.make_prefix_phrase(toks, field)
1316             else:
1317                 q = self.make_term_query(toks, field)
1318             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1319
1320         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1321
1322         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1323
1324     def hint_books(self, string, max_results=50, prefix=True):
1325         """
1326         Returns auto-complete hints for book titles
1327         (needed because we do not index 'pseudo' title-tags).
1328         Uses prefix search.
1329         """
1330         toks = self.get_tokens(string, field='SIMPLE')
1331
1332         if prefix:
1333             q = self.make_prefix_phrase(toks, 'title')
1334         else:
1335             q = self.make_term_query(toks, 'title')
1336
1337         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
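    # Hedged autocomplete sketch (query strings are hypothetical):
    #
    #     search = Search()
    #     search.hint_tags(u'mick')    # -> Tag / PDCounterAuthor objects
    #     search.hint_books(u'pan t')  # -> Book objects with a matching title prefix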
1338
1339     @staticmethod
1340     def chain_filters(filters, op=ChainedFilter.AND):
1341         """
1342         Chains a filter list together
1343         """
1344         filters = filter(lambda x: x is not None, filters)
1345         if not filters:
1346             return None
1347         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1348         return chf
1349
1350     def filtered_categories(self, tags):
1351         """
1352         Return a list of tag categories, present in tags list.
1353         """
1354         cats = {}
1355         for t in tags:
1356             cats[t.category] = True
1357         return cats.keys()
1358
1359     def hint(self):
1360         return Hint(self)