apps/search/index.py

   1 # -*- coding: utf-8 -*-
   2
   3 from django.conf import settings
   4 from lucene import SimpleFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
   5     File, Field, Integer, \
   6     NumericField, Version, Document, JavaError, IndexSearcher, \
   7     QueryParser, PerFieldAnalyzerWrapper, \
   8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
   9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
  10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
  11     HashSet, BooleanClause, Term, CharTermAttribute, \
  12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
  13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
  14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
  15     BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
  16     initVM, CLASSPATH, JArray, JavaError
  17     # KeywordAnalyzer
  18
  19 # Initialize jvm
  20 JVM = initVM(CLASSPATH)
  21
  22 import sys
  23 import os
  24 import re
  25 import errno
  26 from librarian import dcparser
  27 from librarian.parser import WLDocument
  28 from lxml import etree
  29 import catalogue.models
  30 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  31 from multiprocessing.pool import ThreadPool
  32 from threading import current_thread
  33 import atexit
  34 import traceback
  35
  36
  37 class WLAnalyzer(PerFieldAnalyzerWrapper):
  38     def __init__(self):
  39         polish = PolishAnalyzer(Version.LUCENE_34)
  40         #        polish_gap.setPositionIncrementGap(999)
  41
  42         simple = SimpleAnalyzer(Version.LUCENE_34)
  43         #        simple_gap.setPositionIncrementGap(999)
  44
  45         keyword = KeywordAnalyzer(Version.LUCENE_34)
  46
  47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
  48
  49         PerFieldAnalyzerWrapper.__init__(self, polish)
  50
  51         self.addAnalyzer("tags", simple)
  52         self.addAnalyzer("technical_editors", simple)
  53         self.addAnalyzer("editors", simple)
  54         self.addAnalyzer("url", keyword)
  55         self.addAnalyzer("source_url", keyword)
  56         self.addAnalyzer("source_name", simple)
  57         self.addAnalyzer("publisher", simple)
  58         self.addAnalyzer("authors", simple)
  59         self.addAnalyzer("title", simple)
  60
  61         self.addAnalyzer("is_book", keyword)
  62         # shouldn't the title have two forms? _pl and simple?
  63
  64         self.addAnalyzer("themes", simple)
  65         self.addAnalyzer("themes_pl", polish)
  66
  67         self.addAnalyzer("tag_name", simple)
  68         self.addAnalyzer("tag_name_pl", polish)
  69
  70         self.addAnalyzer("translators", simple)
  71
  72         self.addAnalyzer("KEYWORD", keyword)
  73         self.addAnalyzer("SIMPLE", simple)
  74         self.addAnalyzer("POLISH", polish)
  75
  76
  77 class IndexStore(object):
  78     """
  79     Provides access to search index.
  80
  81     self.store - lucene index directory
  82     """
  83     def __init__(self):
  84         self.make_index_dir()
  85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
  86
  87     def make_index_dir(self):
  88         try:
  89             os.makedirs(settings.SEARCH_INDEX)
  90         except OSError as exc:
  91             if exc.errno == errno.EEXIST:
  92                 pass
  93             else: raise
  94
  95
  96 class IndexChecker(IndexStore):
  97     def __init__(self):
  98         IndexStore.__init__(self)
  99
 100     def check(self):
 101         checker = CheckIndex(self.store)
 102         status = checker.checkIndex()
 103         return status
 104
 105
 106 class Snippets(object):
 107     """
 108     This class manages snippet files for indexed object (book)
 109     the snippets are concatenated together, and their positions and
 110     lengths are kept in lucene index fields.
 111     """
 112     SNIPPET_DIR = "snippets"
 113
 114     def __init__(self, book_id):
 115         try:
 116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
 117         except OSError as exc:
 118             if exc.errno == errno.EEXIST:
 119                 pass
 120             else: raise
 121         self.book_id = book_id
 122         self.file = None
 123
 124     def open(self, mode='r'):
 125         """
 126         Open the snippet file. Call .close() afterwards.
 127         """
 128         if not 'b' in mode:
 129             mode += 'b'
 130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
 131         self.position = 0
 132         return self
 133
 134     def add(self, snippet):
 135         """
 136         Append a snippet (unicode) to the snippet file.
 137         Return a (position, length) tuple
 138         """
 139         txt = snippet.encode('utf-8')
 140         l = len(txt)
 141         self.file.write(txt)
 142         pos = (self.position, l)
 143         self.position += l
 144         return pos
 145
 146     def get(self, pos):
 147         """
 148         Given a tuple of (position, length) return an unicode
 149         of the snippet stored there.
 150         """
 151         self.file.seek(pos[0], 0)
 152         txt = self.file.read(pos[1]).decode('utf-8')
 153         return txt
 154
 155     def close(self):
 156         """Close snippet file"""
 157         self.file.close()
 158
 159
 160 class BaseIndex(IndexStore):
 161     """
 162     Base index class.
 163     Provides basic operations on index: opening, closing, optimizing.
 164     """
 165     def __init__(self, analyzer=None):
 166         super(BaseIndex, self).__init__()
 167         self.index = None
 168         if not analyzer:
 169             analyzer = WLAnalyzer()
 170         self.analyzer = analyzer
 171
 172     def open(self, timeout=None):
 173         if self.index:
 174             raise Exception("Index is already opened")
 175         conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
 176         if timeout:
 177             conf.setWriteLockTimeout(long(timeout))
 178         self.index = IndexWriter(self.store, conf)
 179         return self.index
 180
 181     def optimize(self):
 182         self.index.optimize()
 183
 184     def close(self):
 185         try:
 186             self.index.optimize()
 187         except JavaError, je:
 188             print "Error during optimize phase, check index: %s" % je
 189
 190         self.index.close()
 191         self.index = None
 192
 193     def __enter__(self):
 194         self.open()
 195         return self
 196
 197     def __exit__(self, type, value, tb):
 198         self.close()
 199
 200
 201 class Index(BaseIndex):
 202     """
 203     Class indexing books.
 204     """
 205     def __init__(self, analyzer=None):
 206         super(Index, self).__init__(analyzer)
 207
 208     def index_tags(self):
 209         """
 210         Re-index global tag list.
 211         Removes all tags from index, then index them again.
 212         Indexed fields include: id, name (with and without polish stems), category
 213         """
 214         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
 215         self.index.deleteDocuments(q)
 216
 217         for tag in catalogue.models.Tag.objects.all():
 218             doc = Document()
 219             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
 220             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 221             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 222             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
 223             self.index.addDocument(doc)
 224
 225         for pdtag in PDCounterAuthor.objects.all():
 226             doc = Document()
 227             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
 228             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
 229             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
 230             doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
 231             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
 232             self.index.addDocument(doc)
 233
 234         for pdtag in PDCounterBook.objects.all():
 235             doc = Document()
 236             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
 237             doc.add(Field("tag_name", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
 238             doc.add(Field("tag_name_pl", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
 239             doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
 240             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
 241             self.index.addDocument(doc)
 242
 243     def create_book_doc(self, book):
 244         """
 245         Create a lucene document referring book id.
 246         """
 247         doc = Document()
 248         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
 249         if book.parent is not None:
 250             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
 251         return doc
 252
 253     def remove_book(self, book):
 254         """Removes a book from search index.
 255         book - Book instance."""
 256         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
 257         self.index.deleteDocuments(q)
 258
 259     def index_book(self, book, book_info=None, overwrite=True):
 260         """
 261         Indexes the book.
 262         Creates a lucene document for extracted metadata
 263         and calls self.index_content() to index the contents of the book.
 264         """
 265         if overwrite:
 266             self.remove_book(book)
 267
 268         book_doc = self.create_book_doc(book)
 269         meta_fields = self.extract_metadata(book, book_info)
 270         for f in meta_fields.values():
 271             if isinstance(f, list) or isinstance(f, tuple):
 272                 for elem in f:
 273                     book_doc.add(elem)
 274             else:
 275                 book_doc.add(f)
 276         self.index.addDocument(book_doc)
 277         del book_doc
 278
 279         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
 280
 281     master_tags = [
 282         'opowiadanie',
 283         'powiesc',
 284         'dramat_wierszowany_l',
 285         'dramat_wierszowany_lp',
 286         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 287         'wywiad',
 288         ]
 289
 290     ignore_content_tags = [
 291         'uwaga', 'extra',
 292         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 293         'didaskalia',
 294         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 295         ]
 296
 297     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 298
 299     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 300
 301     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 302
 303     def extract_metadata(self, book, book_info=None):
 304         """
 305         Extract metadata from book and returns a map of fields keyed by fieldname
 306         """
 307         fields = {}
 308
 309         if book_info is None:
 310             book_info = dcparser.parse(open(book.xml_file.path))
 311
 312         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
 313         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
 314         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
 315
 316         # validator, name
 317         for field in dcparser.BookInfo.FIELDS:
 318             if hasattr(book_info, field.name):
 319                 if not getattr(book_info, field.name):
 320                     continue
 321                 # since no type information is available, we use validator
 322                 type_indicator = field.validator
 323                 if type_indicator == dcparser.as_unicode:
 324                     s = getattr(book_info, field.name)
 325                     if field.multiple:
 326                         s = ', '.join(s)
 327                     try:
 328                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
 329                     except JavaError as je:
 330                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
 331                 elif type_indicator == dcparser.as_person:
 332                     p = getattr(book_info, field.name)
 333                     if isinstance(p, dcparser.Person):
 334                         persons = unicode(p)
 335                     else:
 336                         persons = ', '.join(map(unicode, p))
 337                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
 338                 elif type_indicator == dcparser.as_date:
 339                     dt = getattr(book_info, field.name)
 340                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
 341                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 342
 343         # get published date
 344         pd = None
 345         if hasattr(book_info, 'source_name') and book_info.source_name:
 346             match = self.published_date_re.search(book_info.source_name)
 347             if match is not None:
 348                 pd = str(match.groups()[0])
 349         if not pd: pd = ""
 350         fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
 351
 352         return fields
 353
 354     def add_gaps(self, fields, fieldname):
 355         """
 356         Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 357         This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 358         """
 359         def gap():
 360             while True:
 361                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 362         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 363
 364     def get_master(self, root):
 365         """
 366         Returns the first master tag from an etree.
 367         """
 368         for master in root.iter():
 369             if master.tag in self.master_tags:
 370                 return master
 371
 372     def index_content(self, book, book_fields=[]):
 373         """
 374         Walks the book XML and extract content from it.
 375         Adds parts for each header tag and for each fragment.
 376         """
 377         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 378         root = wld.edoc.getroot()
 379
 380         master = self.get_master(root)
 381         if master is None:
 382             return []
 383
 384         def walker(node, ignore_tags=[]):
 385
 386             if node.tag not in ignore_tags:
 387                 yield node, None, None
 388                 if node.text is not None:
 389                     yield None, node.text, None
 390                 for child in list(node):
 391                     for b, t, e in walker(child):
 392                         yield b, t, e
 393                 yield None, None, node
 394
 395             if node.tail is not None:
 396                 yield None, node.tail, None
 397             return
 398
 399         def fix_format(text):
 400             #            separator = [u" ", u"\t", u".", u";", u","]
 401             if isinstance(text, list):
 402                 # need to join it first
 403                 text = filter(lambda s: s is not None, content)
 404                 text = u' '.join(text)
 405                 # for i in range(len(text)):
 406                 #     if i > 0:
 407                 #         if text[i][0] not in separator\
 408                 #             and text[i - 1][-1] not in separator:
 409                 #          text.insert(i, u" ")
 410
 411             return re.sub("(?m)/$", "", text)
 412
 413         def add_part(snippets, **fields):
 414             doc = self.create_book_doc(book)
 415             for f in book_fields:
 416                 doc.add(f)
 417
 418             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
 419             doc.add(NumericField("header_span", Field.Store.YES, True)\
 420                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
 421             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
 422
 423             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
 424                           Field.TermVector.WITH_POSITIONS_OFFSETS))
 425
 426             snip_pos = snippets.add(fields["content"])
 427             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
 428             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
 429
 430             if 'fragment_anchor' in fields:
 431                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
 432                               Field.Store.YES, Field.Index.NOT_ANALYZED))
 433
 434             if 'themes' in fields:
 435                 themes, themes_pl = zip(*[
 436                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
 437                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
 438                      for theme in fields['themes']])
 439
 440                 themes = self.add_gaps(themes, 'themes')
 441                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
 442
 443                 for t in themes:
 444                     doc.add(t)
 445                 for t in themes_pl:
 446                     doc.add(t)
 447
 448             return doc
 449
 450         def give_me_utf8(s):
 451             if isinstance(s, unicode):
 452                 return s.encode('utf-8')
 453             else:
 454                 return s
 455
 456         fragments = {}
 457         snippets = Snippets(book.id).open('w')
 458         try:
 459             for header, position in zip(list(master), range(len(master))):
 460
 461                 if header.tag in self.skip_header_tags:
 462                     continue
 463                 if header.tag is etree.Comment:
 464                     continue
 465
 466                 # section content
 467                 content = []
 468                 footnote = []
 469
 470                 def all_content(text):
 471                     for frag in fragments.values():
 472                         frag['content'].append(text)
 473                     content.append(text)
 474                 handle_text = [all_content]
 475
 476
 477                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 478                     # handle footnotes
 479                     if start is not None and start.tag in self.footnote_tags:
 480                         footnote = []
 481                         def collect_footnote(t):
 482                             footnote.append(t)
 483                         handle_text.append(collect_footnote)
 484                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 485                         handle_text.pop()
 486                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 487                                        content=u''.join(footnote),
 488                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
 489
 490                         self.index.addDocument(doc)
 491                         #print "@ footnote text: %s" % footnote
 492                         footnote = []
 493
 494                     # handle fragments and themes.
 495                     if start is not None and start.tag == 'begin':
 496                         fid = start.attrib['id'][1:]
 497                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 498
 499                     # themes for this fragment
 500                     elif start is not None and start.tag == 'motyw':
 501                         fid = start.attrib['id'][1:]
 502                         handle_text.append(None)
 503                         if start.text is not None:
 504                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
 505                     elif end is not None and end.tag == 'motyw':
 506                         handle_text.pop()
 507
 508                     elif start is not None and start.tag == 'end':
 509                         fid = start.attrib['id'][1:]
 510                         if fid not in fragments:
 511                             continue  # a broken <end> node, skip it
 512                         frag = fragments[fid]
 513                         if frag['themes'] == []:
 514                             continue  # empty themes list.
 515                         del fragments[fid]
 516
 517                         doc = add_part(snippets,
 518                                        header_type=frag['start_header'],
 519                                        header_index=frag['start_section'],
 520                                        header_span=position - frag['start_section'] + 1,
 521                                        fragment_anchor=fid,
 522                                        content=fix_format(frag['content']),
 523                                        themes=frag['themes'])
 524                         #print '@ FRAG %s' % frag['content']
 525                         self.index.addDocument(doc)
 526
 527                         # Collect content.
 528
 529                     if text is not None and handle_text is not []:
 530                         hdl = handle_text[-1]
 531                         if hdl is not None:
 532                             hdl(text)
 533
 534                         # in the end, add a section text.
 535                 doc = add_part(snippets, header_index=position, header_type=header.tag,
 536                                content=fix_format(content))
 537                 #print '@ CONTENT: %s' % fix_format(content)
 538
 539                 self.index.addDocument(doc)
 540
 541         finally:
 542             snippets.close()
 543
 544
 545 def log_exception_wrapper(f):
 546     def _wrap(*a):
 547         try:
 548             f(*a)
 549         except Exception, e:
 550             print("Error in indexing thread: %s" % e)
 551             traceback.print_exc()
 552             raise e
 553     return _wrap
 554
 555
 556 class ReusableIndex(Index):
 557     """
 558     Works like index, but does not close/optimize Lucene index
 559     until program exit (uses atexit hook).
 560     This is usefull for importbooks command.
 561
 562     if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
 563     """
 564     index = None
 565
 566     def open(self, analyzer=None, **kw):
 567         if ReusableIndex.index:
 568             self.index = ReusableIndex.index
 569         else:
 570             print("opening index")
 571             Index.open(self, analyzer, **kw)
 572             ReusableIndex.index = self.index
 573             atexit.register(ReusableIndex.close_reusable)
 574
 575     # def index_book(self, *args, **kw):
 576     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
 577     #     ReusableIndex.pool_jobs.append(job)
 578
 579     @staticmethod
 580     def close_reusable():
 581         if ReusableIndex.index:
 582             print("closing index")
 583             ReusableIndex.index.optimize()
 584             ReusableIndex.index.close()
 585             ReusableIndex.index = None
 586
 587     def close(self):
 588         if ReusableIndex.index:
 589             ReusableIndex.index.commit()
 590
 591
 592 class JoinSearch(object):
 593     """
 594     This mixin could be used to handle block join queries.
 595     (currently unused)
 596     """
 597     def __init__(self, *args, **kw):
 598         super(JoinSearch, self).__init__(*args, **kw)
 599
 600     def wrapjoins(self, query, fields=[]):
 601         """
 602         This functions modifies the query in a recursive way,
 603         so Term and Phrase Queries contained, which match
 604         provided fields are wrapped in a BlockJoinQuery,
 605         and so delegated to children documents.
 606         """
 607         if BooleanQuery.instance_(query):
 608             qs = BooleanQuery.cast_(query)
 609             for clause in qs:
 610                 clause = BooleanClause.cast_(clause)
 611                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
 612             return qs
 613         else:
 614             termset = HashSet()
 615             query.extractTerms(termset)
 616             for t in termset:
 617                 t = Term.cast_(t)
 618                 if t.field() not in fields:
 619                     return query
 620             return BlockJoinQuery(query, self.parent_filter,
 621                                   BlockJoinQuery.ScoreMode.Total)
 622
 623     def bsearch(self, query, max_results=50):
 624         q = self.query(query)
 625         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
 626
 627         tops = self.searcher.search(bjq, max_results)
 628         bks = []
 629         for found in tops.scoreDocs:
 630             doc = self.searcher.doc(found.doc)
 631             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 632         return (bks, tops.totalHits)
 633
 634
 635 class SearchResult(object):
 636     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
 637         if tokens_cache is None: tokens_cache = {}
 638
 639         if score:
 640             self._score = score
 641         else:
 642             self._score = scoreDocs.score
 643
 644         self.boost = 1.0
 645
 646         self._hits = []
 647         self._processed_hits = None  # processed hits
 648
 649         stored = search.searcher.doc(scoreDocs.doc)
 650         self.book_id = int(stored.get("book_id"))
 651
 652         pd = stored.get("published_date")
 653         try:
 654             self.published_date = int(pd)
 655         except ValueError:
 656             self.published_date = 0
 657
 658         header_type = stored.get("header_type")
 659         # we have a content hit in some header of fragment
 660         if header_type is not None:
 661             sec = (header_type, int(stored.get("header_index")))
 662             header_span = stored.get('header_span')
 663             header_span = header_span is not None and int(header_span) or 1
 664
 665             fragment = stored.get("fragment_anchor")
 666
 667             if snippets:
 668                 snippets = snippets.replace("/\n", "\n")
 669             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
 670
 671             self._hits.append(hit)
 672
 673         self.search = search
 674         self.searched = searched
 675         self.tokens_cache = tokens_cache
 676
 677     @property
 678     def score(self):
 679         return self._score * self.boost
 680
 681     def merge(self, other):
 682         if self.book_id != other.book_id:
 683             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 684         self._hits += other._hits
 685         if other.score > self.score:
 686             self._score = other._score
 687         return self
 688
 689     def get_book(self):
 690         return catalogue.models.Book.objects.get(id=self.book_id)
 691
 692     book = property(get_book)
 693
 694     @property
 695     def hits(self):
 696         if self._processed_hits is not None:
 697             return self._processed_hits
 698
 699         POSITION = 0
 700         FRAGMENT = 1
 701         POSITION_INDEX = 1
 702         POSITION_SPAN = 2
 703         SCORE = 2
 704         OTHER = 3
 705
 706         # to sections and fragments
 707         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
 708         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
 709         sect = filter(lambda s: 0 == len(filter(
 710             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
 711             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
 712             frags)), sect)
 713
 714         hits = []
 715
 716         # remove duplicate fragments
 717         fragments = {}
 718         for f in frags:
 719             fid = f[FRAGMENT]
 720             if fid in fragments:
 721                 if fragments[fid][SCORE] >= f[SCORE]:
 722                     continue
 723             fragments[fid] = f
 724         frags = fragments.values()
 725
 726         # remove duplicate sections
 727         sections = {}
 728
 729         for s in sect:
 730             si = s[POSITION][POSITION_INDEX]
 731             # skip existing
 732             if si in sections:
 733                 if sections[si]['score'] >= s[SCORE]:
 734                     continue
 735
 736             m = {'score': s[SCORE],
 737                  'section_number': s[POSITION][POSITION_INDEX] + 1,
 738                  }
 739             m.update(s[OTHER])
 740             sections[si] = m
 741
 742         hits = sections.values()
 743
 744         for f in frags:
 745             try:
 746                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
 747             except catalogue.models.Fragment.DoesNotExist:
 748                 # stale index
 749                 continue
 750
 751             # Figure out if we were searching for a token matching some word in theme name.
 752             themes = frag.tags.filter(category='theme')
 753             themes_hit = []
 754             if self.searched is not None:
 755                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
 756                 for theme in themes:
 757                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
 758                     for t in tokens:
 759                         if t in name_tokens:
 760                             if not theme in themes_hit:
 761                                 themes_hit.append(theme)
 762                             break
 763
 764             m = {'score': f[SCORE],
 765                  'fragment': frag,
 766                  'section_number': f[POSITION][POSITION_INDEX] + 1,
 767                  'themes': themes,
 768                  'themes_hit': themes_hit
 769                  }
 770             m.update(f[OTHER])
 771             hits.append(m)
 772
 773         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 774
 775         self._processed_hits = hits
 776
 777         return hits
 778
 779     def __unicode__(self):
 780         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
 781
 782     @staticmethod
 783     def aggregate(*result_lists):
 784         books = {}
 785         for rl in result_lists:
 786             for r in rl:
 787                 if r.book_id in books:
 788                     books[r.book_id].merge(r)
 789                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
 790                 else:
 791                     books[r.book_id] = r
 792         return books.values()
 793
 794     def __cmp__(self, other):
 795         c = cmp(self.score, other.score)
 796         if c == 0:
 797             # this is inverted, because earlier date is better
 798             return cmp(other.published_date, self.published_date)
 799         else:
 800             return c
 801
 802
 803 class Hint(object):
 804     """
 805     Given some hint information (information we already know about)
 806     our search target - like author, title (specific book), epoch, genre, kind
 807     we can narrow down search using filters.
 808     """
 809     def __init__(self, search):
 810         """
 811         Accepts a Searcher instance.
 812         """
 813         self.search = search
 814         self.book_tags = {}
 815         self.part_tags = []
 816         self._books = []
 817
 818     def books(self, *books):
 819         """
 820         Give a hint that we search these books.
 821         """
 822         self._books = books
 823
 824     def tags(self, tags):
 825         """
 826         Give a hint that these Tag objects (a list of)
 827         is necessary.
 828         """
 829         for t in tags:
 830             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
 831                 lst = self.book_tags.get(t.category, [])
 832                 lst.append(t)
 833                 self.book_tags[t.category] = lst
 834             if t.category in ['theme', 'theme_pl']:
 835                 self.part_tags.append(t)
 836
 837     def tag_filter(self, tags, field='tags'):
 838         """
 839         Given a lsit of tags and an optional field (but they are normally in tags field)
 840         returns a filter accepting only books with specific tags.
 841         """
 842         q = BooleanQuery()
 843
 844         for tag in tags:
 845             toks = self.search.get_tokens(tag.name, field=field)
 846             tag_phrase = PhraseQuery()
 847             for tok in toks:
 848                 tag_phrase.add(Term(field, tok))
 849             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
 850
 851         return QueryWrapperFilter(q)
 852
 853     def book_filter(self):
 854         """
 855         Filters using book tags (all tag kinds except a theme)
 856         """
 857         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
 858         if tags:
 859             return self.tag_filter(tags)
 860         else:
 861             return None
 862
 863     def part_filter(self):
 864         """
 865         This filter can be used to look for book parts.
 866         It filters on book id and/or themes.
 867         """
 868         fs = []
 869         if self.part_tags:
 870             fs.append(self.tag_filter(self.part_tags, field='themes'))
 871
 872         if self._books != []:
 873             bf = BooleanFilter()
 874             for b in self._books:
 875                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
 876                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
 877             fs.append(bf)
 878
 879         return Search.chain_filters(fs)
 880
 881     def should_search_for_book(self):
 882         return self._books == []
 883
 884     def just_search_in(self, all):
 885         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
 886         some = []
 887         for field in all:
 888             if field == 'authors' and 'author' in self.book_tags:
 889                 continue
 890             if field == 'title' and self._books != []:
 891                 continue
 892             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
 893                 continue
 894             some.append(field)
 895         return some
 896
 897
 898 class Search(IndexStore):
 899     """
 900     Search facilities.
 901     """
 902     def __init__(self, default_field="content"):
 903         IndexStore.__init__(self)
 904         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
 905         # self.analyzer = WLAnalyzer()
 906         self.searcher = IndexSearcher(self.store, True)
 907         self.parser = QueryParser(Version.LUCENE_34, default_field,
 908                                   self.analyzer)
 909
 910         self.parent_filter = TermsFilter()
 911         self.parent_filter.addTerm(Term("is_book", "true"))
 912
 913     def query(self, query):
 914         """Parse query in default Lucene Syntax. (for humans)
 915         """
 916         return self.parser.parse(query)
 917
 918     def simple_search(self, query, max_results=50):
 919         """Runs a query for books using lucene syntax. (for humans)
 920         Returns (books, total_hits)
 921         """
 922
 923         tops = self.searcher.search(self.query(query), max_results)
 924         bks = []
 925         for found in tops.scoreDocs:
 926             doc = self.searcher.doc(found.doc)
 927             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 928         return (bks, tops.totalHits)
 929
 930     def get_tokens(self, searched, field='content', cached=None):
 931         """returns tokens analyzed by a proper (for a field) analyzer
 932         argument can be: StringReader, string/unicode, or tokens. In the last case
 933         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
 934         """
 935         if cached is not None and field in cached:
 936             return cached[field]
 937
 938         if isinstance(searched, str) or isinstance(searched, unicode):
 939             searched = StringReader(searched)
 940         elif isinstance(searched, list):
 941             return searched
 942
 943         searched.reset()
 944         tokens = self.analyzer.reusableTokenStream(field, searched)
 945         toks = []
 946         while tokens.incrementToken():
 947             cta = tokens.getAttribute(CharTermAttribute.class_)
 948             toks.append(cta.toString())
 949
 950         if cached is not None:
 951             cached[field] = toks
 952
 953         return toks
 954
 955     def fuzziness(self, fuzzy):
 956         """Helper method to sanitize fuzziness"""
 957         if not fuzzy:
 958             return None
 959         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
 960             return fuzzy
 961         else:
 962             return 0.5
 963
 964     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
 965         """
 966         Return a PhraseQuery with a series of tokens.
 967         """
 968         if fuzzy:
 969             phrase = MultiPhraseQuery()
 970             for t in tokens:
 971                 term = Term(field, t)
 972                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
 973                 fuzzterms = []
 974
 975                 while True:
 976                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
 977                     ft = fuzzterm.term()
 978                     if ft:
 979                         fuzzterms.append(ft)
 980                     if not fuzzterm.next(): break
 981                 if fuzzterms:
 982                     phrase.add(JArray('object')(fuzzterms, Term))
 983                 else:
 984                     phrase.add(term)
 985         else:
 986             phrase = PhraseQuery()
 987             phrase.setSlop(slop)
 988             for t in tokens:
 989                 term = Term(field, t)
 990                 phrase.add(term)
 991         return phrase
 992
 993     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
 994         """
 995         Returns term queries joined by boolean query.
 996         modal - applies to boolean query
 997         fuzzy - should the query by fuzzy.
 998         """
 999         q = BooleanQuery()
1000         for t in tokens:
1001             term = Term(field, t)
1002             if fuzzy:
1003                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1004             else:
1005                 term = TermQuery(term)
1006             q.add(BooleanClause(term, modal))
1007         return q
1008
1009     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1010                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1011         if filters is None: filters = []
1012         if tokens_cache is None: tokens_cache = {}
1013
1014         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1015
1016         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1017         if book:
1018             filters.append(self.term_filter(Term('is_book', 'true')))
1019         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1020
1021         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1022
1023     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1024                     filters=None, tokens_cache=None, boost=None, snippets=True):
1025         if filters is None: filters = []
1026         if tokens_cache is None: tokens_cache = {}
1027
1028         if book:
1029             filters.append(self.term_filter(Term('is_book', 'true')))
1030
1031         query = BooleanQuery()
1032
1033         for fld in fields:
1034             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1035
1036             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1037                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1038
1039         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1040
1041         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1042                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1043
1044     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1045         """
1046         Search for perfect book matches. Just see if the query matches with some author or title,
1047         taking hints into account.
1048         """
1049         fields_to_search = ['authors', 'title']
1050         only_in = None
1051         if hint:
1052             if not hint.should_search_for_book():
1053                 return []
1054             fields_to_search = hint.just_search_in(fields_to_search)
1055             only_in = hint.book_filter()
1056
1057         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1058
1059         books = []
1060         for q in qrys:
1061             top = self.searcher.search(q,
1062                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1063                 max_results)
1064             for found in top.scoreDocs:
1065                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1066         return books
1067
1068     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1069         fields_to_search = ['tags', 'authors', 'title']
1070
1071         only_in = None
1072         if hint:
1073             if not hint.should_search_for_book():
1074                 return []
1075             fields_to_search = hint.just_search_in(fields_to_search)
1076             only_in = hint.book_filter()
1077
1078         tokens = self.get_tokens(searched, field='SIMPLE')
1079
1080         q = BooleanQuery()
1081
1082         for fld in fields_to_search:
1083             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1084                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1085
1086         books = []
1087         top = self.searcher.search(q,
1088                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1089             max_results)
1090         for found in top.scoreDocs:
1091             books.append(SearchResult(self, found, how_found="search_book"))
1092
1093         return books
1094
1095     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1096         """
1097         Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
1098         some part/fragment of the book.
1099         """
1100         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1101
1102         flt = None
1103         if hint:
1104             flt = hint.part_filter()
1105
1106         books = []
1107         for q in qrys:
1108             top = self.searcher.search(q,
1109                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1110                                                            flt]),
1111                                        max_results)
1112             for found in top.scoreDocs:
1113                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1114
1115         return books
1116
1117     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1118         """
1119         Tries to use search terms to match different fields of book (or its parts).
1120         E.g. one word can be an author survey, another be a part of the title, and the rest
1121         are some words from third chapter.
1122         """
1123         if tokens_cache is None: tokens_cache = {}
1124         books = []
1125         only_in = None
1126
1127         if hint:
1128             only_in = hint.part_filter()
1129
1130         # content only query : themes x content
1131         q = BooleanQuery()
1132
1133         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1134         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1135
1136         # only search in themes when we do not already filter by themes
1137         if hint is None or hint.just_search_in(['themes']) != []:
1138             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1139                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1140
1141         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1142                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1143
1144         topDocs = self.searcher.search(q, only_in, max_results)
1145         for found in topDocs.scoreDocs:
1146             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1147             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1148
1149         # query themes/content x author/title/tags
1150         q = BooleanQuery()
1151         in_content = BooleanQuery()
1152         in_meta = BooleanQuery()
1153
1154         for fld in ['themes_pl', 'content']:
1155             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1156
1157         for fld in ['tags', 'authors', 'title']:
1158             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1159
1160         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1161         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1162
1163         topDocs = self.searcher.search(q, only_in, max_results)
1164         for found in topDocs.scoreDocs:
1165             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1166             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1167
1168         return books
1169
1170     # def multisearch(self, query, max_results=50):
1171     #     """
1172     #     Search strategy:
1173     #     - (phrase) OR -> content
1174     #                   -> title
1175     #                   -> authors
1176     #     - (keywords)  -> authors
1177     #                   -> motyw
1178     #                   -> tags
1179     #                   -> content
1180     #     """
1181         # queryreader = StringReader(query)
1182         # tokens = self.get_tokens(queryreader)
1183
1184         # top_level = BooleanQuery()
1185         # Should = BooleanClause.Occur.SHOULD
1186
1187         # phrase_level = BooleanQuery()
1188         # phrase_level.setBoost(1.3)
1189
1190         # p_content = self.make_phrase(tokens, joined=True)
1191         # p_title = self.make_phrase(tokens, 'title')
1192         # p_author = self.make_phrase(tokens, 'author')
1193
1194         # phrase_level.add(BooleanClause(p_content, Should))
1195         # phrase_level.add(BooleanClause(p_title, Should))
1196         # phrase_level.add(BooleanClause(p_author, Should))
1197
1198         # kw_level = BooleanQuery()
1199
1200         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1201         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1202         # kw_level.add(j_themes, Should)
1203         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1204         # j_con = self.make_term_query(tokens, joined=True)
1205         # kw_level.add(j_con, Should)
1206
1207         # top_level.add(BooleanClause(phrase_level, Should))
1208         # top_level.add(BooleanClause(kw_level, Should))
1209
1210         # return None
1211
1212     def get_snippets(self, scoreDoc, query, field='content'):
1213         """
1214         Returns a snippet for found scoreDoc.
1215         """
1216         htmlFormatter = SimpleHTMLFormatter()
1217         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1218
1219         stored = self.searcher.doc(scoreDoc.doc)
1220
1221         position = stored.get('snippets_position')
1222         length = stored.get('snippets_length')
1223         if position is None or length is None:
1224             return None
1225         # locate content.
1226         book_id = int(stored.get('book_id'))
1227         snippets = Snippets(book_id).open()
1228         try:
1229             try:
1230                 text = snippets.get((int(position),
1231                                      int(length)))
1232             finally:
1233                 snippets.close()
1234
1235             tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1236             #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1237             snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1238
1239         except Exception, e:
1240             e2 = e
1241             if hasattr(e, 'getJavaException'):
1242                 e2 = unicode(e.getJavaException())
1243             raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1244                 e2)
1245         return snip
1246
1247     @staticmethod
1248     def enum_to_array(enum):
1249         """
1250         Converts a lucene TermEnum to array of Terms, suitable for
1251         addition to queries
1252         """
1253         terms = []
1254
1255         while True:
1256             t = enum.term()
1257             if t:
1258                 terms.append(t)
1259             if not enum.next(): break
1260
1261         if terms:
1262             return JArray('object')(terms, Term)
1263
1264     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1265         """
1266         Search for Tag objects using query.
1267         """
1268         if not pdcounter:
1269             filters = self.chain_filters([filter, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1270         tops = self.searcher.search(query, filters, max_results)
1271
1272         tags = []
1273         for found in tops.scoreDocs:
1274             doc = self.searcher.doc(found.doc)
1275             is_pdcounter = doc.get('is_pdcounter')
1276             category = doc.get('tag_category')
1277             if is_pdcounter == 'true':
1278                 if category == 'pd_author':
1279                     tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1280                 elif category == 'pd_book':
1281                     tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1282                     tag.category = 'pd_book'  # make it look more lik a tag.
1283                 else:
1284                     print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1285             else:
1286                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1287                 # don't add the pdcounter tag if same tag already exists
1288             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1289                 tags.append(tag)
1290                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1291         print 'returning %s' % tags
1292         return tags
1293
1294     def search_books(self, query, filter=None, max_results=10):
1295         """
1296         Searches for Book objects using query
1297         """
1298         bks = []
1299         tops = self.searcher.search(query, filter, max_results)
1300         for found in tops.scoreDocs:
1301             doc = self.searcher.doc(found.doc)
1302             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1303         return bks
1304
1305     def make_prefix_phrase(self, toks, field):
1306         q = MultiPhraseQuery()
1307         for i in range(len(toks)):
1308             t = Term(field, toks[i])
1309             if i == len(toks) - 1:
1310                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1311                 if pterms:
1312                     q.add(pterms)
1313                 else:
1314                     q.add(t)
1315             else:
1316                 q.add(t)
1317         return q
1318
1319     @staticmethod
1320     def term_filter(term, inverse=False):
1321         only_term = TermsFilter()
1322         only_term.addTerm(term)
1323
1324         if inverse:
1325             neg = BooleanFilter()
1326             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1327             only_term = neg
1328
1329         return only_term
1330
1331     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1332         """
1333         Return auto-complete hints for tags
1334         using prefix search.
1335         """
1336         toks = self.get_tokens(string, field='SIMPLE')
1337         top = BooleanQuery()
1338
1339         for field in ['tag_name', 'tag_name_pl']:
1340             if prefix:
1341                 q = self.make_prefix_phrase(toks, field)
1342             else:
1343                 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1344             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1345
1346         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1347
1348         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1349
1350     def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1351         """
1352         Returns auto-complete hints for book titles
1353         Because we do not index 'pseudo' title-tags.
1354         Prefix search.
1355         """
1356         toks = self.get_tokens(string, field='SIMPLE')
1357
1358         if prefix:
1359             q = self.make_prefix_phrase(toks, 'title')
1360         else:
1361             q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1362
1363         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1364
1365     @staticmethod
1366     def chain_filters(filters, op=ChainedFilter.AND):
1367         """
1368         Chains a filter list together
1369         """
1370         filters = filter(lambda x: x is not None, filters)
1371         if not filters or filters is []:
1372             return None
1373         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1374         return chf
1375
1376     def filtered_categories(self, tags):
1377         """
1378         Return a list of tag categories, present in tags list.
1379         """
1380         cats = {}
1381         for t in tags:
1382             cats[t.category] = True
1383         return cats.keys()
1384
1385     def hint(self):
1386         return Hint(self)