# -*- coding: utf-8 -*-

from django.conf import settings
from django.dispatch import Signal
from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM before any lucene classes are used.
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from multiprocessing.pool import ThreadPool
from threading import current_thread
from itertools import chain
import atexit
import traceback
import logging
log = logging.getLogger('search')

class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise

    def close(self):
        self.store.close()


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together; their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            # never overwrite an existing snippet file; bump the revision
            # until we find an unused path.
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return a unicode string
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

    def remove(self):
        # remove the unrevisioned file, then consecutive revisions,
        # stopping at the first one that is missing.
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass

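# A minimal usage sketch for Snippets (the book id and text are made up):
#
#     snips = Snippets(1).open('w')
#     try:
#         pos = snips.add(u"Ala ma kota.")     # -> (position, length)
#     finally:
#         snips.close()
#
#     snips = Snippets(1, revision=snips.revision).open()
#     try:
#         print snips.get(pos)                 # -> u"Ala ma kota."
#     finally:
#         snips.close()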

class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on the index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, timeout=None):
        if self.index:
            raise Exception("Index is already opened")
        conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
        if timeout:
            conf.setWriteLockTimeout(long(timeout))
        self.index = IndexWriter(self.store, conf)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            log.error("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

        index_changed.send_robust(self)

        super(BaseIndex, self).close()

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()


index_changed = Signal()

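# A sketch of typical use: BaseIndex subclasses are context managers, and
# closing the writer emits index_changed (so open Search instances reopen
# their readers):
#
#     with Index() as index:
#         index.index_tags()
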
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        remove_only = kw.get('remove_only', False)
        # first, remove the tags from the index.
        if tags:
            q = BooleanQuery()
            for tag in tags:
                b_id_cat = BooleanQuery()

                q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
                b_id_cat.add(q_id, BooleanClause.Occur.MUST)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = TermQuery(Term('tag_category', 'pd_author'))
                elif isinstance(tag, PDCounterBook):
                    q_cat = TermQuery(Term('tag_category', 'pd_book'))
                else:
                    q_cat = TermQuery(Term('tag_category', tag.category))
                b_id_cat.add(q_cat, BooleanClause.Occur.MUST)

                q.add(b_id_cat, BooleanClause.Occur.SHOULD)
        else:  # all
            q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        if not remove_only:
            # then add them [all, or just the ones passed]
            if not tags:
                tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
                    PDCounterAuthor.objects.all(), \
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = Document()
                    doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
                    doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
                    doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
                    doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
                    doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
                    self.index.addDocument(doc)
                elif isinstance(tag, PDCounterBook):
                    doc = Document()
                    doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
                    doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
                    doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
                    doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
                    doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
                    self.index.addDocument(doc)
                else:
                    doc = Document()
                    doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
                    doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
                    doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
                    doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
                    self.index.addDocument(doc)

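    # A hedged sketch of how index_tags is typically driven (e.g. from a
    # management command; the surrounding context is hypothetical):
    #
    #     with Index() as index:
    #         index.index_tags(tag)                    # re-index one tag
    #         index.index_tags(tag, remove_only=True)  # drop a deleted tag
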
    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book_or_id - Book instance or book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
        self.index.deleteDocuments(q)

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

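    # For instance (a sketch of the intended call site): when a book is
    # deleted from the catalogue, drop its documents from the index too,
    # so stale hits do not linger until the next full reindex:
    #
    #     with Index() as index:
    #         index.remove_book(book_id)
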
    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
        # let's not index it - it's only used for extracting the publish date
        del meta_fields['source_name']

        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)
        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    # tags whose content does not go into the index
    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")

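    # published_date_re picks the last run of digits, ignoring any trailing
    # "]", "." or spaces, e.g. (a sketch):
    #
    #     >>> Index.published_date_re.search(u"Czytelnik, Warszawa [1991].").group(1)
    #     u'1991'
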
    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # get the published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interleave a list of fields with gap fields, which are indexed spaces, and return it.
        This allows phrase queries that do not match across the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

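    # e.g. (a sketch) add_gaps([f1, f2, f3], 'tags') -> [f1, gap, f2, gap, f3];
    # with slop=0 a phrase can still match inside one tag value, but not
    # across two neighbouring tag values.
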
    def get_master(self, root):
        """
        Return the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walk the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            # Walk the tree in document order, yielding (start, text, end)
            # triples: (node, None, None) on entering a node, (None, text, None)
            # for each text chunk, and (None, None, node) on leaving a node.
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child, ignore_tags=ignore_tags):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            #            separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #          text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)\
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
            if snippets.revision:
                doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'is_footnote' in fields:
                # the caller passes a ready-made Field for footnote parts
                doc.add(fields['is_footnote'])

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['content'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []
                        def collect_footnote(t):
                            footnote.append(t)
                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        handle_text.pop()
                        if footnote:
                            doc = add_part(snippets, header_index=position, header_type=header.tag,
                                           content=u''.join(footnote),
                                           is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
                            self.index.addDocument(doc)
                            #print "@ footnote text: %s" % footnote
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])
                        #print '@ FRAG %s' % frag['content']
                        self.index.addDocument(doc)

                    # Collect content.
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))
                #print '@ CONTENT: %s' % fix_format(content)

                self.index.addDocument(doc)

        finally:
            snippets.close()


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            log.error("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, **kw):
        # analyzer is accepted for interface compatibility, but the analyzer
        # chosen in __init__ is what the writer actually uses.
        if ReusableIndex.index:
            self.index = ReusableIndex.index
        else:
            Index.open(self, **kw)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

            index_changed.send_robust(None)

    def close(self):
        if ReusableIndex.index:
            ReusableIndex.index.commit()

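# Sketch of the intended pattern (as in an import command): many
# ReusableIndex instances share a single writer, which is optimized and
# closed once, at process exit:
#
#     for book in books:
#         index = ReusableIndex()
#         index.open()
#         index.index_book(book)
#         index.close()   # commits, but keeps the shared writer open

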
class JoinSearch(object):
    """
    This mixin can be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query in a recursive way,
        so that the Term and Phrase queries it contains, which match
        the provided fields, are wrapped in a BlockJoinQuery
        and so delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None:
            tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        try:
            self.published_date = int(pd)
        except ValueError:
            self.published_date = 0

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

            if snippets:
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if hasattr(self, '_book'):
            return self._book
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # indices into the hit tuples gathered in __init__
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # split into fragment hits and section hits
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        hits = []

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
        frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
                                  lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)
                            break

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c


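# The usual pattern (a sketch) is to gather results from several strategies,
# fold them per book, then sort best-first:
#
#     results = SearchResult.aggregate(
#         search.search_perfect_book(searched),
#         search.search_everywhere(searched))
#     results.sort(reverse=True)   # __cmp__: higher score, then earlier date

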
class Hint(object):
    """
    Given some hint information (things we already know) about our search
    target - like the author, title (a specific book), epoch, genre, kind -
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search within these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list) are required.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they normally live in the
        'tags' field), return a filter accepting only books with the specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return not self._books

    def just_search_in(self, all_fields):
        """Figures out which fields should be searched, given the hints we already have."""
        some = []
        for field in all_fields:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some


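# A usage sketch (the tag list is hypothetical): restrict a book search to
# a known author before falling back to a broader query:
#
#     hint = search.hint()
#     hint.tags(author_tags)
#     results = search.search_perfect_book(u"pan tadeusz", hint=hint)

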
class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        reader = IndexReader.open(self.store, True)
        self.searcher = IndexSearcher(reader)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))
        index_changed.connect(self.reopen)

    def close(self):
        reader = self.searcher.getIndexReader()
        self.searcher.close()
        reader.close()
        super(Search, self).close()
        index_changed.disconnect(self.reopen)

    def reopen(self, **unused):
        reader = self.searcher.getIndexReader()
        rdr = reader.reopen()
        if not rdr.equals(reader):
            log.debug('Reopening index')
            oldsearch = self.searcher
            self.searcher = IndexSearcher(rdr)
            oldsearch.close()
            reader.close()

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits).
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

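    # e.g. (a sketch; any Lucene-syntax query works here):
    #
    #     search = Search()
    #     books, total = search.simple_search(u'authors: Mickiewicz')
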
    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode, or a token list; in
        the last case the tokens are just returned (so tokens can be reused if
        the analyzer does not change).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks

    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness parameter."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None:
            filters = []
        if tokens_cache is None:
            tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched)
                for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None:
            filters = []
        if tokens_cache is None:
            tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None))
                for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2,
        the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use the search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        if tokens_cache is None:
            tokens_cache = {}
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)

    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD

    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)

    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')

    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))

    #     kw_level = BooleanQuery()

    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)

    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))

    #     return None

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None
        revision = stored.get('snippets_revision')
        if revision:
            revision = int(revision)

        # locate content.
        book_id = int(stored.get('book_id'))
        snippets = Snippets(book_id, revision=revision)

        try:
            snippets.open()
        except IOError as e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []

        try:
            try:
                text = snippets.get((int(position),
                                     int(length)))
            finally:
                snippets.close()

            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
            #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        except Exception as e:
            e2 = e
            if hasattr(e, 'getJavaException'):
                e2 = unicode(e.getJavaException())
            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
                e2)
        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        adding to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not pdcounter:
            filt = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filt, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
            category = doc.get('tag_category')
            try:
                if is_pdcounter == 'true':
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                        continue
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                # don't add the pdcounter tag if the same tag already exists
                if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
                    tags.append(tag)
            except catalogue.models.Tag.DoesNotExist: pass
            except PDCounterAuthor.DoesNotExist: pass
            except PDCounterBook.DoesNotExist: pass

        log.debug('search_tags: %s' % tags)

        return tags

    def search_books(self, query, filt=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filt, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            try:
                bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
            except catalogue.models.Book.DoesNotExist: pass
        return bks

    def make_prefix_phrase(self, toks, field):
        # phrase query over the tokens, with the last token expanded to all
        # terms it is a prefix of.
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
        """
        Return auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q = self.make_prefix_phrase(toks, field)
            else:
                q = self.make_term_query(toks, field, fuzzy=fuzzy)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)

    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
        """
        Returns auto-complete hints for book titles
        (since we do not index 'pseudo' title-tags).
        Uses prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        if prefix:
            q = self.make_prefix_phrase(toks, 'title')
        else:
            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

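    # e.g. (a sketch of a search-box autocomplete view):
    #
    #     tags = search.hint_tags(u"mick", max_results=10)
    #     books = search.hint_books(u"pan t", max_results=10)
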
    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together, skipping Nones.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)