librarian bump
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from django.dispatch import Signal
5 from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
6     File, Field, Integer, \
7     NumericField, Version, Document, JavaError, IndexSearcher, \
8     QueryParser, PerFieldAnalyzerWrapper, \
9     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
10     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
11     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
12     HashSet, BooleanClause, Term, CharTermAttribute, \
13     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
14     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
15     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
16     BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
17     initVM, CLASSPATH, JArray, JavaError
18     # KeywordAnalyzer
19
20 # Initialize jvm
21 JVM = initVM(CLASSPATH, maxheap=settings.JVM_MAXHEAP)
22
23 import sys
24 import os
25 import re
26 import errno
27 from librarian import dcparser
28 from librarian.parser import WLDocument
29 from lxml import etree
30 import catalogue.models
31 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
32 from multiprocessing.pool import ThreadPool
33 from threading import current_thread
34 from itertools import chain
35 import atexit
36 import traceback
37 import logging
38 log = logging.getLogger('search')
39
class WLAnalyzer(PerFieldAnalyzerWrapper):
    """
    Analyzer used for the search index.

    Wraps a polish analyzer and registers a simple, keyword or polish
    analyzer for each of the field names listed in __init__.
    """
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        # not sure if needed: there's NOT_ANALYZED meaning basically the same
        keyword = KeywordAnalyzer(Version.LUCENE_34)

        PerFieldAnalyzerWrapper.__init__(self, polish)

        # (field name, analyzer) pairs, registered in this order
        # shouldn't the title have two forms? _pl and simple?
        per_field = (
            ("tags", simple),
            ("technical_editors", simple),
            ("editors", simple),
            ("url", keyword),
            ("source_url", keyword),
            ("source_name", simple),
            ("publisher", simple),
            ("authors", simple),
            ("title", simple),
            ("is_book", keyword),
            ("themes", simple),
            ("themes_pl", polish),
            ("tag_name", simple),
            ("tag_name_pl", polish),
            ("translators", simple),
            ("KEYWORD", keyword),
            ("SIMPLE", simple),
            ("POLISH", polish),
        )
        for field_name, field_analyzer in per_field:
            self.addAnalyzer(field_name, field_analyzer)
78
79
class IndexStore(object):
    """
    Gives access to the search index directory.

    self.store - lucene directory opened on settings.SEARCH_INDEX
    """
    def __init__(self):
        self.make_index_dir()
        index_path = File(settings.SEARCH_INDEX)
        self.store = NIOFSDirectory(index_path)

    def make_index_dir(self):
        """Create settings.SEARCH_INDEX; an already existing directory is not an error."""
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            # only "already exists" is tolerated, everything else propagates
            if exc.errno != errno.EEXIST:
                raise

    def close(self):
        """Close the lucene directory held in self.store."""
        self.store.close()
100
101
class IndexChecker(IndexStore):
    """Runs lucene's CheckIndex over the index directory."""
    def __init__(self):
        super(IndexChecker, self).__init__()

    def check(self):
        """Run CheckIndex.checkIndex() on self.store and return its status."""
        return CheckIndex(self.store).checkIndex()
110
111
class Snippets(object):
    """
    This class manages snippet files for indexed object (book)
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.

    The file lives in SEARCH_INDEX/snippets/ and is named after the book
    id; opening for writing while that file exists picks the first free
    "ID.REVISION" name instead of overwriting it.

    An opened instance can be used as a context manager, which closes
    the file on exit.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            # the directory being there already is the normal case
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = 0

    @property
    def path(self):
        """Path of the snippet file for the current book id and revision."""
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        The file is always opened in binary mode.  Returns self.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            # never overwrite an existing file: take the first unused revision
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple, both counted in utf-8 bytes.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        """Close snippet file; does nothing if it was never opened."""
        if self.file is not None:
            self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def remove(self):
        """
        Delete the snippet file of the book and its consecutive revisions
        (ID.1, ID.2, ...) up to the first one that does not exist.

        A missing file ends the removal quietly; any other OSError
        (e.g. lack of permissions) is raised.  self.revision is None afterwards.
        """
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError as exc:
            # ENOENT is how the loop above finds the end of the revisions
            if exc.errno != errno.ENOENT:
                raise
        finally:
            # do not stay pointed at a revision that does not exist
            self.revision = None
192
193
class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on index: opening, closing, optimizing.

    Can be used as a context manager: the index writer is opened on
    entry and closed on exit.
    """
    def __init__(self, analyzer=None):
        """analyzer - lucene analyzer to index with; a WLAnalyzer is created when not given."""
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, timeout=None):
        """
        Open an IndexWriter on the store and return it.

        timeout - optional write lock timeout handed to the writer config.
        Raises Exception when the index is already opened.
        """
        if self.index:
            raise Exception("Index is already opened")
        conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
        if timeout:
            conf.setWriteLockTimeout(long(timeout))
        self.index = IndexWriter(self.store, conf)
        return self.index

    def optimize(self):
        """Optimize the opened index."""
        self.index.optimize()

    def close(self):
        """
        Optimize and close the index writer, send index_changed, then
        close the store.  A failing optimize is only logged.
        When the writer was never opened only the store is closed.
        """
        if self.index is not None:
            try:
                self.index.optimize()
            except JavaError as je:
                log.error("Error during optimize phase, check index: %s" % je)

            self.index.close()
            self.index = None

            index_changed.send_robust(self)

        super(BaseIndex, self).close()

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
237
238
# Signal sent (via send_robust) once the index writer has been closed:
# BaseIndex.close() sends it with the index object as sender,
# ReusableIndex.close_reusable() sends it with sender None.
index_changed = Signal()
240
241
242 class Index(BaseIndex):
243     """
244     Class indexing books.
245     """
246     def __init__(self, analyzer=None):
247         super(Index, self).__init__(analyzer)
248
249     def index_tags(self, *tags, **kw):
250         """
251         Re-index global tag list.
252         Removes all tags from index, then index them again.
253         Indexed fields include: id, name (with and without polish stems), category
254         """
255         remove_only = kw.get('remove_only', False)
256         # first, remove tags from index.
257         if tags:
258             q = BooleanQuery()
259             for tag in tags:
260                 b_id_cat = BooleanQuery()
261
262                 q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
263                 b_id_cat.add(q_id, BooleanClause.Occur.MUST)
264
265                 if isinstance(tag, PDCounterAuthor):
266                     q_cat = TermQuery(Term('tag_category', 'pd_author'))
267                 elif isinstance(tag, PDCounterBook):
268                     q_cat = TermQuery(Term('tag_category', 'pd_book'))
269                 else:
270                     q_cat = TermQuery(Term('tag_category', tag.category))
271                 b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
272
273                 q.add(b_id_cat, BooleanClause.Occur.SHOULD)
274         else:  # all
275             q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
276             self.index.deleteDocuments(q)
277
278         if not remove_only:
279             # then add them [all or just one passed]
280             if not tags:
281                 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
282                     PDCounterAuthor.objects.all(), \
283                     PDCounterBook.objects.all())
284
285             for tag in tags:
286                 if isinstance(tag, PDCounterAuthor):
287                     doc = Document()
288                     doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
289                     doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
290                     doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
291                     doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
292                     doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
293                     self.index.addDocument(doc)
294                 elif isinstance(tag, PDCounterBook):
295                     doc = Document()
296                     doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
297                     doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
298                     doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
299                     doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
300                     doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
301                     self.index.addDocument(doc)
302                 else:
303                     doc = Document()
304                     doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
305                     doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
306                     doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
307                     doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
308                     self.index.addDocument(doc)
309
310     def create_book_doc(self, book):
311         """
312         Create a lucene document referring book id.
313         """
314         doc = Document()
315         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
316         if book.parent is not None:
317             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
318         return doc
319
320     def remove_book(self, book_or_id, remove_snippets=True):
321         """Removes a book from search index.
322         book - Book instance."""
323         if isinstance(book_or_id, catalogue.models.Book):
324             book_id = book_or_id.id
325         else:
326             book_id = book_or_id
327
328         q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
329         self.index.deleteDocuments(q)
330
331         if remove_snippets:
332             snippets = Snippets(book_id)
333             snippets.remove()
334
335     def index_book(self, book, book_info=None, overwrite=True):
336         """
337         Indexes the book.
338         Creates a lucene document for extracted metadata
339         and calls self.index_content() to index the contents of the book.
340         """
341         if overwrite:
342             # we don't remove snippets, since they might be still needed by
343             # threads using not reopened index
344             self.remove_book(book, remove_snippets=False)
345
346         book_doc = self.create_book_doc(book)
347         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
348         # let's not index it - it's only used for extracting publish date
349         if 'source_name' in meta_fields:
350             del meta_fields['source_name']
351         
352         for f in meta_fields.values():
353             if isinstance(f, list) or isinstance(f, tuple):
354                 for elem in f:
355                     book_doc.add(elem)
356             else:
357                 book_doc.add(f)
358         self.index.addDocument(book_doc)
359         del book_doc
360
361         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
362
363     master_tags = [
364         'opowiadanie',
365         'powiesc',
366         'dramat_wierszowany_l',
367         'dramat_wierszowany_lp',
368         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
369         'wywiad',
370         ]
371
372     ignore_content_tags = [
373         'uwaga', 'extra',
374         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
375         'didaskalia',
376         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
377         ]
378
379     footnote_tags = ['pa', 'pt', 'pr', 'pe']
380
381     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
382
383     published_date_re = re.compile("([0-9]+)[\]. ]*$")
384
385     def extract_metadata(self, book, book_info=None, dc_only=None):
386         """
387         Extract metadata from book and returns a map of fields keyed by fieldname
388         """
389         fields = {}
390
391         if book_info is None:
392             book_info = dcparser.parse(open(book.xml_file.path))
393
394         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
395         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
396         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
397
398         # validator, name
399         for field in dcparser.BookInfo.FIELDS:
400             if dc_only and field.name not in dc_only:
401                 continue
402             if hasattr(book_info, field.name):
403                 if not getattr(book_info, field.name):
404                     continue
405                 # since no type information is available, we use validator
406                 type_indicator = field.validator
407                 if type_indicator == dcparser.as_unicode:
408                     s = getattr(book_info, field.name)
409                     if field.multiple:
410                         s = ', '.join(s)
411                     try:
412                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
413                     except JavaError as je:
414                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
415                 elif type_indicator == dcparser.as_person:
416                     p = getattr(book_info, field.name)
417                     if isinstance(p, dcparser.Person):
418                         persons = unicode(p)
419                     else:
420                         persons = ', '.join(map(unicode, p))
421                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
422                 elif type_indicator == dcparser.as_date:
423                     dt = getattr(book_info, field.name)
424                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
425                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
426
427         # get published date
428         pd = None
429         if hasattr(book_info, 'source_name') and book_info.source_name:
430             match = self.published_date_re.search(book_info.source_name)
431             if match is not None:
432                 pd = str(match.groups()[0])
433         if not pd: pd = ""
434         fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
435
436         return fields
437
438     def add_gaps(self, fields, fieldname):
439         """
440         Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
441         This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
442         """
443         def gap():
444             while True:
445                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
446         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
447
448     def get_master(self, root):
449         """
450         Returns the first master tag from an etree.
451         """
452         for master in root.iter():
453             if master.tag in self.master_tags:
454                 return master
455
456     def index_content(self, book, book_fields=[]):
457         """
458         Walks the book XML and extract content from it.
459         Adds parts for each header tag and for each fragment.
460         """
461         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
462         root = wld.edoc.getroot()
463
464         master = self.get_master(root)
465         if master is None:
466             return []
467
468         def walker(node, ignore_tags=[]):
469
470             if node.tag not in ignore_tags:
471                 yield node, None, None
472                 if node.text is not None:
473                     yield None, node.text, None
474                 for child in list(node):
475                     for b, t, e in walker(child):
476                         yield b, t, e
477                 yield None, None, node
478
479             if node.tail is not None:
480                 yield None, node.tail, None
481             return
482
483         def fix_format(text):
484             #            separator = [u" ", u"\t", u".", u";", u","]
485             if isinstance(text, list):
486                 # need to join it first
487                 text = filter(lambda s: s is not None, content)
488                 text = u' '.join(text)
489                 # for i in range(len(text)):
490                 #     if i > 0:
491                 #         if text[i][0] not in separator\
492                 #             and text[i - 1][-1] not in separator:
493                 #          text.insert(i, u" ")
494
495             return re.sub("(?m)/$", "", text)
496
497         def add_part(snippets, **fields):
498             doc = self.create_book_doc(book)
499             for f in book_fields:
500                 doc.add(f)
501
502             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
503             doc.add(NumericField("header_span", Field.Store.YES, True)\
504                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
505             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
506
507             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
508                           Field.TermVector.WITH_POSITIONS_OFFSETS))
509
510             snip_pos = snippets.add(fields["content"])
511             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
512             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
513             if snippets.revision:
514                 doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
515
516             if 'fragment_anchor' in fields:
517                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
518                               Field.Store.YES, Field.Index.NOT_ANALYZED))
519
520             if 'themes' in fields:
521                 themes, themes_pl = zip(*[
522                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
523                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
524                      for theme in fields['themes']])
525
526                 themes = self.add_gaps(themes, 'themes')
527                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
528
529                 for t in themes:
530                     doc.add(t)
531                 for t in themes_pl:
532                     doc.add(t)
533
534             return doc
535
536         def give_me_utf8(s):
537             if isinstance(s, unicode):
538                 return s.encode('utf-8')
539             else:
540                 return s
541
542         fragments = {}
543         snippets = Snippets(book.id).open('w')
544         try:
545             for header, position in zip(list(master), range(len(master))):
546
547                 if header.tag in self.skip_header_tags:
548                     continue
549                 if header.tag is etree.Comment:
550                     continue
551
552                 # section content
553                 content = []
554                 footnote = []
555
556                 def all_content(text):
557                     for frag in fragments.values():
558                         frag['content'].append(text)
559                     content.append(text)
560                 handle_text = [all_content]
561
562
563                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
564                     # handle footnotes
565                     if start is not None and start.tag in self.footnote_tags:
566                         footnote = []
567                         def collect_footnote(t):
568                             footnote.append(t)
569                         handle_text.append(collect_footnote)
570                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
571                         handle_text.pop()
572                         doc = add_part(snippets, header_index=position, header_type=header.tag,
573                                        content=u''.join(footnote),
574                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
575                 
576                         self.index.addDocument(doc)
577                         #print "@ footnote text: %s" % footnote
578                         footnote = []
579                     
580                     # handle fragments and themes.
581                     if start is not None and start.tag == 'begin':
582                         fid = start.attrib['id'][1:]
583                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
584
585                     # themes for this fragment
586                     elif start is not None and start.tag == 'motyw':
587                         fid = start.attrib['id'][1:]
588                         handle_text.append(None)
589                         if start.text is not None:
590                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
591                     elif end is not None and end.tag == 'motyw':
592                         handle_text.pop()
593
594                     elif start is not None and start.tag == 'end':
595                         fid = start.attrib['id'][1:]
596                         if fid not in fragments:
597                             continue  # a broken <end> node, skip it
598                         frag = fragments[fid]
599                         if frag['themes'] == []:
600                             continue  # empty themes list.
601                         del fragments[fid]
602
603                         doc = add_part(snippets,
604                                        header_type=frag['start_header'],
605                                        header_index=frag['start_section'],
606                                        header_span=position - frag['start_section'] + 1,
607                                        fragment_anchor=fid,
608                                        content=fix_format(frag['content']),
609                                        themes=frag['themes'])
610                         #print '@ FRAG %s' % frag['content']
611                         self.index.addDocument(doc)
612
613                         # Collect content.
614
615                     if text is not None and handle_text is not []:
616                         hdl = handle_text[-1]
617                         if hdl is not None:
618                             hdl(text)
619
620                         # in the end, add a section text.
621                 doc = add_part(snippets, header_index=position, header_type=header.tag,
622                                content=fix_format(content))
623                 #print '@ CONTENT: %s' % fix_format(content)
624
625                 self.index.addDocument(doc)
626
627         finally:
628             snippets.close()
629
630
def log_exception_wrapper(f):
    """
    Wrap `f` so that an Exception raised by it is logged to the search
    logger and its traceback printed before it is raised again.

    The wrapper passes positional and keyword arguments on to `f`
    and returns what `f` returns.
    """
    def _wrap(*a, **kw):
        try:
            return f(*a, **kw)
        except Exception as e:
            log.error("Error in indexing thread: %s" % e)
            traceback.print_exc()
            # bare raise keeps the traceback of the original error
            raise
    return _wrap
640
641
class ReusableIndex(Index):
    """
    Works like index, but does not close/optimize Lucene index
    until program exit (uses atexit hook).
    This is useful for importbooks command.

    if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
    """
    # the index writer shared by all instances, None while it is not opened
    index = None

    def open(self, analyzer=None, **kw):
        """
        Attach to the shared index writer, opening it first when needed,
        and return it.

        analyzer - when given and the writer has to be opened, used
                   instead of the analyzer of this instance.
        kw - passed to Index.open() (e.g. timeout).
        """
        if ReusableIndex.index:
            self.index = ReusableIndex.index
        else:
            # the analyzer must not go to Index.open() positionally:
            # its first argument is the write lock timeout
            if analyzer is not None:
                self.analyzer = analyzer
            Index.open(self, **kw)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)
        return self.index

    @staticmethod
    def close_reusable():
        """Optimize and close the shared index writer, if there is one."""
        if ReusableIndex.index:
            try:
                ReusableIndex.index.optimize()
            except JavaError as je:
                # same as BaseIndex.close(): still close the writer
                log.error("Error during optimize phase, check index: %s" % je)
            ReusableIndex.index.close()
            ReusableIndex.index = None

            index_changed.send_robust(None)

    def close(self):
        """Commit the changes; the shared writer itself stays open."""
        if ReusableIndex.index:
            ReusableIndex.index.commit()
676
677
class JoinSearch(object):
    """
    Mixin for handling block join queries (currently unused).

    It reads self.parent_filter, self.searcher and calls self.query(),
    none of which it defines itself.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        Recursively rewrite the query so that its non-boolean parts whose
        terms all belong to `fields` are wrapped in a BlockJoinQuery, and
        so delegated to children documents.  Boolean queries are modified
        in place, clause by clause, and returned.
        """
        if not BooleanQuery.instance_(query):
            terms = HashSet()
            query.extractTerms(terms)
            for raw_term in terms:
                # a single term outside of `fields` leaves the query as it is
                if Term.cast_(raw_term).field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

        boolean_query = BooleanQuery.cast_(query)
        for raw_clause in boolean_query:
            clause = BooleanClause.cast_(raw_clause)
            clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
        return boolean_query

    def bsearch(self, query, max_results=50):
        """
        Run `query` as a block join query.
        Returns a tuple: (list of books found, total number of hits).
        """
        joined = BlockJoinQuery(self.query(query), self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
        tops = self.searcher.search(joined, max_results)

        books = [
            catalogue.models.Book.objects.get(id=self.searcher.doc(found.doc).get("book_id"))
            for found in tops.scoreDocs]
        return (books, tops.totalHits)
717             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
718         return (bks, tops.totalHits)
719
720
721 class SearchResult(object):
722     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
723         if tokens_cache is None: tokens_cache = {}
724
725         if score:
726             self._score = score
727         else:
728             self._score = scoreDocs.score
729
730         self.boost = 1.0
731
732         self._hits = []
733         self._processed_hits = None  # processed hits
734
735         stored = search.searcher.doc(scoreDocs.doc)
736         self.book_id = int(stored.get("book_id"))
737
738         pd = stored.get("published_date")
739         try:
740             self.published_date = int(pd)
741         except ValueError:
742             self.published_date = 0
743
744         header_type = stored.get("header_type")
745         # we have a content hit in some header of fragment
746         if header_type is not None:
747             sec = (header_type, int(stored.get("header_index")))
748             header_span = stored.get('header_span')
749             header_span = header_span is not None and int(header_span) or 1
750
751             fragment = stored.get("fragment_anchor")
752
753             if snippets:
754                 snippets = snippets.replace("/\n", "\n")
755             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
756
757             self._hits.append(hit)
758
759         self.search = search
760         self.searched = searched
761         self.tokens_cache = tokens_cache
762
763     @property
764     def score(self):
765         return self._score * self.boost
766
767     def merge(self, other):
768         if self.book_id != other.book_id:
769             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
770         self._hits += other._hits
771         if other.score > self.score:
772             self._score = other._score
773         return self
774
775     def get_book(self):
776         if hasattr(self, '_book'):
777             return self._book
778         return catalogue.models.Book.objects.get(id=self.book_id)
779
780     book = property(get_book)
781
782     @property
783     def hits(self):
784         if self._processed_hits is not None:
785             return self._processed_hits
786
787         POSITION = 0
788         FRAGMENT = 1
789         POSITION_INDEX = 1
790         POSITION_SPAN = 2
791         SCORE = 2
792         OTHER = 3
793
794         # to sections and fragments
795         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
796
797         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
798
799         # sections not covered by fragments
800         sect = filter(lambda s: 0 == len(filter(
801             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
802             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
803             frags)), sect)
804
805         hits = []
806
807         def remove_duplicates(lst, keyfn, compare):
808             els = {}
809             for e in lst:
810                 eif = keyfn(e)
811                 if eif in els:
812                     if compare(els[eif], e) >= 1:
813                         continue
814                 els[eif] = e
815             return els.values()
816
817         # remove fragments with duplicated fid's and duplicated snippets
818         frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
819         frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
820                                   lambda a, b: cmp(a[SCORE], b[SCORE]))
821
822         # remove duplicate sections
823         sections = {}
824
825         for s in sect:
826             si = s[POSITION][POSITION_INDEX]
827             # skip existing
828             if si in sections:
829                 if sections[si]['score'] >= s[SCORE]:
830                     continue
831
832             m = {'score': s[SCORE],
833                  'section_number': s[POSITION][POSITION_INDEX] + 1,
834                  }
835             m.update(s[OTHER])
836             sections[si] = m
837
838         hits = sections.values()
839
840         for f in frags:
841             try:
842                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
843             except catalogue.models.Fragment.DoesNotExist:
844                 # stale index
845                 continue
846
847             # Figure out if we were searching for a token matching some word in theme name.
848             themes = frag.tags.filter(category='theme')
849             themes_hit = []
850             if self.searched is not None:
851                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
852                 for theme in themes:
853                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
854                     for t in tokens:
855                         if t in name_tokens:
856                             if not theme in themes_hit:
857                                 themes_hit.append(theme)
858                             break
859
860             m = {'score': f[SCORE],
861                  'fragment': frag,
862                  'section_number': f[POSITION][POSITION_INDEX] + 1,
863                  'themes': themes,
864                  'themes_hit': themes_hit
865                  }
866             m.update(f[OTHER])
867             hits.append(m)
868
869         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
870
871         self._processed_hits = hits
872
873         return hits
874
875     def __unicode__(self):
876         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
877
878     @staticmethod
879     def aggregate(*result_lists):
880         books = {}
881         for rl in result_lists:
882             for r in rl:
883                 if r.book_id in books:
884                     books[r.book_id].merge(r)
885                 else:
886                     books[r.book_id] = r
887         return books.values()
888
889     def __cmp__(self, other):
890         c = cmp(self.score, other.score)
891         if c == 0:
892             # this is inverted, because earlier date is better
893             return cmp(other.published_date, self.published_date)
894         else:
895             return c
896
897
class Hint(object):
    """
    Given some hint information (information we already know about
    our search target - like author, title (specific book), epoch, genre, kind)
    we can narrow down search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Search instance.
        """
        self.search = search
        # book level tags, grouped by tag category
        self.book_tags = {}
        # theme tags - these apply to parts of books
        self.part_tags = []
        # books the search should be limited to
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        # *books arrives as a tuple; keep a list, as set up in __init__
        self._books = list(books)

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of)
        are necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                self.book_tags.setdefault(t.category, []).append(t)
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (but they are normally in tags field)
        returns a filter accepting only books with specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            # every tag name has to appear as a phrase
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme).
        Returns None when no book tags were hinted.
        """
        tags = [t for same_category in self.book_tags.values() for t in same_category]
        if tags:
            return self.tag_filter(tags)
        return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books:
            # accept a part belonging to any of the hinted books
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        """
        Tell if books should be searched for at all: not when
        specific books were already hinted.
        """
        return not self._books

    def just_search_in(self, all):
        """Holds logic to figure out which indexes should be searched, when we have some hints already"""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
991
992
993 class Search(IndexStore):
994     """
995     Search facilities.
996     """
997     def __init__(self, default_field="content"):
998         IndexStore.__init__(self)
999         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
1000         # self.analyzer = WLAnalyzer()
1001         reader = IndexReader.open(self.store, True)
1002         self.searcher = IndexSearcher(reader)
1003         self.parser = QueryParser(Version.LUCENE_34, default_field,
1004                                   self.analyzer)
1005
1006         self.parent_filter = TermsFilter()
1007         self.parent_filter.addTerm(Term("is_book", "true"))
1008         index_changed.connect(self.reopen)
1009
1010     def close(self):
1011         reader = self.searcher.getIndexReader()
1012         self.searcher.close()
1013         reader.close()
1014         super(Search, self).close()
1015         index_changed.disconnect(self.reopen)
1016
1017     def reopen(self, **unused):
1018         reader = self.searcher.getIndexReader()
1019         rdr = reader.reopen()
1020         if not rdr.equals(reader):
1021             log.debug('Reopening index')
1022             oldsearch = self.searcher
1023             self.searcher = IndexSearcher(rdr)
1024             oldsearch.close()
1025             reader.close()
1026
1027     def query(self, query):
1028         """Parse query in default Lucene Syntax. (for humans)
1029         """
1030         return self.parser.parse(query)
1031
1032     def simple_search(self, query, max_results=50):
1033         """Runs a query for books using lucene syntax. (for humans)
1034         Returns (books, total_hits)
1035         """
1036
1037         tops = self.searcher.search(self.query(query), max_results)
1038         bks = []
1039         for found in tops.scoreDocs:
1040             doc = self.searcher.doc(found.doc)
1041             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1042         return (bks, tops.totalHits)
1043
1044     def get_tokens(self, searched, field='content', cached=None):
1045         """returns tokens analyzed by a proper (for a field) analyzer
1046         argument can be: StringReader, string/unicode, or tokens. In the last case
1047         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
1048         """
1049         if cached is not None and field in cached:
1050             return cached[field]
1051
1052         if isinstance(searched, str) or isinstance(searched, unicode):
1053             searched = StringReader(searched)
1054         elif isinstance(searched, list):
1055             return searched
1056
1057         searched.reset()
1058         tokens = self.analyzer.reusableTokenStream(field, searched)
1059         toks = []
1060         while tokens.incrementToken():
1061             cta = tokens.getAttribute(CharTermAttribute.class_)
1062             toks.append(cta.toString())
1063
1064         if cached is not None:
1065             cached[field] = toks
1066
1067         return toks
1068
1069     @staticmethod
1070     def fuzziness(fuzzy):
1071         """Helper method to sanitize fuzziness"""
1072         if not fuzzy:
1073             return None
1074         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
1075             return fuzzy
1076         else:
1077             return 0.5
1078
1079     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
1080         """
1081         Return a PhraseQuery with a series of tokens.
1082         """
1083         if fuzzy:
1084             phrase = MultiPhraseQuery()
1085             for t in tokens:
1086                 term = Term(field, t)
1087                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
1088                 fuzzterms = []
1089
1090                 while True:
1091                     ft = fuzzterm.term()
1092                     if ft:
1093                         fuzzterms.append(ft)
1094                     if not fuzzterm.next(): break
1095                 if fuzzterms:
1096                     phrase.add(JArray('object')(fuzzterms, Term))
1097                 else:
1098                     phrase.add(term)
1099         else:
1100             phrase = PhraseQuery()
1101             phrase.setSlop(slop)
1102             for t in tokens:
1103                 term = Term(field, t)
1104                 phrase.add(term)
1105         return phrase
1106
1107     @staticmethod
1108     def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
1109         """
1110         Returns term queries joined by boolean query.
1111         modal - applies to boolean query
1112         fuzzy - should the query by fuzzy.
1113         """
1114         q = BooleanQuery()
1115         for t in tokens:
1116             term = Term(field, t)
1117             if fuzzy:
1118                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1119             else:
1120                 term = TermQuery(term)
1121             q.add(BooleanClause(term, modal))
1122         return q
1123
1124     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1125                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1126         if filters is None: filters = []
1127         if tokens_cache is None: tokens_cache = {}
1128
1129         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1130
1131         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1132         if book:
1133             filters.append(self.term_filter(Term('is_book', 'true')))
1134         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1135
1136         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1137
1138     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1139                     filters=None, tokens_cache=None, boost=None, snippets=True):
1140         if filters is None: filters = []
1141         if tokens_cache is None: tokens_cache = {}
1142
1143         if book:
1144             filters.append(self.term_filter(Term('is_book', 'true')))
1145
1146         query = BooleanQuery()
1147
1148         for fld in fields:
1149             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1150
1151             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1152                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1153
1154         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1155
1156         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1157                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1158
1159     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1160         """
1161         Search for perfect book matches. Just see if the query matches with some author or title,
1162         taking hints into account.
1163         """
1164         fields_to_search = ['authors', 'title']
1165         only_in = None
1166         if hint:
1167             if not hint.should_search_for_book():
1168                 return []
1169             fields_to_search = hint.just_search_in(fields_to_search)
1170             only_in = hint.book_filter()
1171
1172         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1173
1174         books = []
1175         for q in qrys:
1176             top = self.searcher.search(q,
1177                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1178                 max_results)
1179             for found in top.scoreDocs:
1180                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1181         return books
1182
1183     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1184         fields_to_search = ['tags', 'authors', 'title']
1185
1186         only_in = None
1187         if hint:
1188             if not hint.should_search_for_book():
1189                 return []
1190             fields_to_search = hint.just_search_in(fields_to_search)
1191             only_in = hint.book_filter()
1192
1193         tokens = self.get_tokens(searched, field='SIMPLE')
1194
1195         q = BooleanQuery()
1196
1197         for fld in fields_to_search:
1198             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1199                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1200
1201         books = []
1202         top = self.searcher.search(q,
1203                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1204             max_results)
1205         for found in top.scoreDocs:
1206             books.append(SearchResult(self, found, how_found="search_book"))
1207
1208         return books
1209
1210     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1211         """
1212         Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
1213         some part/fragment of the book.
1214         """
1215         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1216
1217         flt = None
1218         if hint:
1219             flt = hint.part_filter()
1220
1221         books = []
1222         for q in qrys:
1223             top = self.searcher.search(q,
1224                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1225                                                            flt]),
1226                                        max_results)
1227             for found in top.scoreDocs:
1228                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1229
1230         return books
1231
1232     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1233         """
1234         Tries to use search terms to match different fields of book (or its parts).
1235         E.g. one word can be an author survey, another be a part of the title, and the rest
1236         are some words from third chapter.
1237         """
1238         if tokens_cache is None: tokens_cache = {}
1239         books = []
1240         only_in = None
1241
1242         if hint:
1243             only_in = hint.part_filter()
1244
1245         # content only query : themes x content
1246         q = BooleanQuery()
1247
1248         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1249         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1250
1251         # only search in themes when we do not already filter by themes
1252         if hint is None or hint.just_search_in(['themes']) != []:
1253             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1254                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1255
1256         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1257                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1258
1259         topDocs = self.searcher.search(q, only_in, max_results)
1260         for found in topDocs.scoreDocs:
1261             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1262
1263         # query themes/content x author/title/tags
1264         q = BooleanQuery()
1265         in_content = BooleanQuery()
1266         in_meta = BooleanQuery()
1267
1268         for fld in ['themes_pl', 'content']:
1269             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1270
1271         for fld in ['tags', 'authors', 'title']:
1272             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1273
1274         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1275         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1276
1277         topDocs = self.searcher.search(q, only_in, max_results)
1278         for found in topDocs.scoreDocs:
1279             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1280
1281         return books
1282
1283     # def multisearch(self, query, max_results=50):
1284     #     """
1285     #     Search strategy:
1286     #     - (phrase) OR -> content
1287     #                   -> title
1288     #                   -> authors
1289     #     - (keywords)  -> authors
1290     #                   -> motyw
1291     #                   -> tags
1292     #                   -> content
1293     #     """
1294         # queryreader = StringReader(query)
1295         # tokens = self.get_tokens(queryreader)
1296
1297         # top_level = BooleanQuery()
1298         # Should = BooleanClause.Occur.SHOULD
1299
1300         # phrase_level = BooleanQuery()
1301         # phrase_level.setBoost(1.3)
1302
1303         # p_content = self.make_phrase(tokens, joined=True)
1304         # p_title = self.make_phrase(tokens, 'title')
1305         # p_author = self.make_phrase(tokens, 'author')
1306
1307         # phrase_level.add(BooleanClause(p_content, Should))
1308         # phrase_level.add(BooleanClause(p_title, Should))
1309         # phrase_level.add(BooleanClause(p_author, Should))
1310
1311         # kw_level = BooleanQuery()
1312
1313         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1314         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1315         # kw_level.add(j_themes, Should)
1316         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1317         # j_con = self.make_term_query(tokens, joined=True)
1318         # kw_level.add(j_con, Should)
1319
1320         # top_level.add(BooleanClause(phrase_level, Should))
1321         # top_level.add(BooleanClause(kw_level, Should))
1322
1323         # return None
1324
1325     def get_snippets(self, scoreDoc, query, field='content'):
1326         """
1327         Returns a snippet for found scoreDoc.
1328         """
1329         htmlFormatter = SimpleHTMLFormatter()
1330         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1331
1332         stored = self.searcher.doc(scoreDoc.doc)
1333
1334         position = stored.get('snippets_position')
1335         length = stored.get('snippets_length')
1336         if position is None or length is None:
1337             return None
1338         revision = stored.get('snippets_revision')
1339         if revision: revision = int(revision)
1340
1341         # locate content.
1342         book_id = int(stored.get('book_id'))
1343         snippets = Snippets(book_id, revision=revision)
1344
1345         try:
1346             snippets.open()
1347         except IOError, e:
1348             log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
1349             return []
1350
1351         try:
1352             try:
1353                 text = snippets.get((int(position),
1354                                      int(length)))
1355             finally:
1356                 snippets.close()
1357
1358             tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1359             #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1360             snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1361
1362         except Exception, e:
1363             e2 = e
1364             if hasattr(e, 'getJavaException'):
1365                 e2 = unicode(e.getJavaException())
1366             raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1367                 e2)
1368         return snip
1369
1370     @staticmethod
1371     def enum_to_array(enum):
1372         """
1373         Converts a lucene TermEnum to array of Terms, suitable for
1374         addition to queries
1375         """
1376         terms = []
1377
1378         while True:
1379             t = enum.term()
1380             if t:
1381                 terms.append(t)
1382             if not enum.next(): break
1383
1384         if terms:
1385             return JArray('object')(terms, Term)
1386
1387     def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
1388         """
1389         Search for Tag objects using query.
1390         """
1391         if not pdcounter:
1392             filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1393         tops = self.searcher.search(query, filt, max_results)
1394
1395         tags = []
1396         for found in tops.scoreDocs:
1397             doc = self.searcher.doc(found.doc)
1398             is_pdcounter = doc.get('is_pdcounter')
1399             category = doc.get('tag_category')
1400             try:
1401                 if is_pdcounter == 'true':
1402                     if category == 'pd_author':
1403                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1404                     elif category == 'pd_book':
1405                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1406                         tag.category = 'pd_book'  # make it look more lik a tag.
1407                     else:
1408                         print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1409                 else:
1410                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1411                     # don't add the pdcounter tag if same tag already exists
1412
1413                 tags.append(tag)
1414
1415             except catalogue.models.Tag.DoesNotExist: pass
1416             except PDCounterAuthor.DoesNotExist: pass
1417             except PDCounterBook.DoesNotExist: pass
1418
1419         log.debug('search_tags: %s' % tags)
1420
1421         return tags
1422
1423     def search_books(self, query, filt=None, max_results=10):
1424         """
1425         Searches for Book objects using query
1426         """
1427         bks = []
1428         tops = self.searcher.search(query, filt, max_results)
1429         for found in tops.scoreDocs:
1430             doc = self.searcher.doc(found.doc)
1431             try:
1432                 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1433             except catalogue.models.Book.DoesNotExist: pass
1434         return bks
1435
1436     def make_prefix_phrase(self, toks, field):
1437         q = MultiPhraseQuery()
1438         for i in range(len(toks)):
1439             t = Term(field, toks[i])
1440             if i == len(toks) - 1:
1441                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1442                 if pterms:
1443                     q.add(pterms)
1444                 else:
1445                     q.add(t)
1446             else:
1447                 q.add(t)
1448         return q
1449
1450     @staticmethod
1451     def term_filter(term, inverse=False):
1452         only_term = TermsFilter()
1453         only_term.addTerm(term)
1454
1455         if inverse:
1456             neg = BooleanFilter()
1457             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1458             only_term = neg
1459
1460         return only_term
1461
1462     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1463         """
1464         Return auto-complete hints for tags
1465         using prefix search.
1466         """
1467         toks = self.get_tokens(string, field='SIMPLE')
1468         top = BooleanQuery()
1469
1470         for field in ['tag_name', 'tag_name_pl']:
1471             if prefix:
1472                 q = self.make_prefix_phrase(toks, field)
1473             else:
1474                 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1475             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1476
1477         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1478
1479         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1480
1481     def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1482         """
1483         Returns auto-complete hints for book titles
1484         Because we do not index 'pseudo' title-tags.
1485         Prefix search.
1486         """
1487         toks = self.get_tokens(string, field='SIMPLE')
1488
1489         if prefix:
1490             q = self.make_prefix_phrase(toks, 'title')
1491         else:
1492             q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1493
1494         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1495
1496     @staticmethod
1497     def chain_filters(filters, op=ChainedFilter.AND):
1498         """
1499         Chains a filter list together
1500         """
1501         filters = filter(lambda x: x is not None, filters)
1502         if not filters or filters is []:
1503             return None
1504         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1505         return chf
1506
1507     def filtered_categories(self, tags):
1508         """
1509         Return a list of tag categories, present in tags list.
1510         """
1511         cats = {}
1512         for t in tags:
1513             cats[t.category] = True
1514         return cats.keys()
1515
1516     def hint(self):
1517         return Hint(self)