# -*- coding: utf-8 -*-
from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray
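
# Note (assumption about PyLucene behaviour): initVM() must be called once per process
# before any of the Lucene classes imported above are instantiated; the returned JVM
# handle is kept at module level so it is not garbage-collected.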
JVM = initVM(CLASSPATH)
import os
import re
import errno
import atexit
import traceback

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from multiprocessing.pool import ThreadPool
from threading import current_thread

class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
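
# A rough usage sketch (illustrative, not part of the original file): the wrapper picks
# an analyzer by field name, so the same text is tokenized differently per field, e.g.
#
#     analyzer = WLAnalyzer()
#     stream = analyzer.reusableTokenStream("title", StringReader(u"Pan Tadeusz"))
#
# "title" goes through the SimpleAnalyzer, while "themes_pl" would use the PolishAnalyzer.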

class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise

class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status

class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together; their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

class BaseIndex(IndexStore):
    """
    Provides basic operations on the index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, timeout=None):
        if self.index:
            raise Exception("Index is already opened")
        conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
        if timeout:
            conf.setWriteLockTimeout(long(timeout))
        self.index = IndexWriter(self.store, conf)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
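
# Usage sketch (illustrative): BaseIndex subclasses work as context managers, so an
# indexing session can be written as
#
#     with Index() as index:
#         index.index_tags()
#
# which opens the writer on entry and optimizes/closes it on exit.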

class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterBook.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)
    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Remove a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)
    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)
        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])

    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")
    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day),
                                               Field.Store.NO, Field.Index.NOT_ANALYZED)

        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields
    def add_gaps(self, fields, fieldname):
        """
        Interpose a list of fields with gap fields (indexed single spaces) and return the result.
        This allows phrase queries that do not cross the gaps (when slop is 0).
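        For example, fields [A, B, C] come back as [A, &lt;gap&gt;, B, &lt;gap&gt;, C].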
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
    def get_master(self, root):
        """
        Return the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            if node.tag not in ignore_tags:
                yield node, None, None

            if node.text is not None:
                yield None, node.text, None

            for child in list(node):
                for b, t, e in walker(child):
                    yield b, t, e

            yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator \
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s
        fragments = {}
        snippets = Snippets(book.id).open('w')

        for header, position in zip(list(master), range(len(master))):

            if header.tag in self.skip_header_tags:
                continue
            if header.tag is etree.Comment:
                continue

            content = []
            footnote = []

            def all_content(text):
                for frag in fragments.values():
                    frag['content'].append(text)
                content.append(text)
            handle_text = [all_content]

            for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                if start is not None and start.tag in self.footnote_tags:
                    footnote = []
                    def collect_footnote(t):
                        footnote.append(t)
                    handle_text.append(collect_footnote)
                elif end is not None and footnote != [] and end.tag in self.footnote_tags:
                    handle_text.pop()
                    doc = add_part(snippets, header_index=position, header_type=header.tag,
                                   content=u''.join(footnote),
                                   is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

                    self.index.addDocument(doc)
                    #print "@ footnote text: %s" % footnote
                    footnote = []

                # handle fragments and themes.
                if start is not None and start.tag == 'begin':
                    fid = start.attrib['id'][1:]
                    fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                # themes for this fragment
                elif start is not None and start.tag == 'motyw':
                    fid = start.attrib['id'][1:]
                    handle_text.append(None)
                    if start.text is not None:
                        fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                elif end is not None and end.tag == 'motyw':
                    handle_text.pop()

                elif start is not None and start.tag == 'end':
                    fid = start.attrib['id'][1:]
                    if fid not in fragments:
                        continue  # a broken <end> node, skip it
                    frag = fragments[fid]
                    if frag['themes'] == []:
                        continue  # empty themes list.
                    del fragments[fid]

                    doc = add_part(snippets,
                                   header_type=frag['start_header'],
                                   header_index=frag['start_section'],
                                   header_span=position - frag['start_section'] + 1,
                                   fragment_anchor=fid,
                                   content=fix_format(frag['content']),
                                   themes=frag['themes'])
                    #print '@ FRAG %s' % frag['content']
                    self.index.addDocument(doc)

                if text is not None and handle_text != []:
                    hdl = handle_text[-1]
                    if hdl is not None:
                        hdl(text)

            # in the end, add a section text.
            doc = add_part(snippets, header_index=position, header_type=header.tag,
                           content=fix_format(content))
            #print '@ CONTENT: %s' % fix_format(content)

            self.index.addDocument(doc)

def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap

class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, **kw):
        if ReusableIndex.index:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer, **kw)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index:
            print("closing index")
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        if ReusableIndex.index:
            ReusableIndex.index.commit()
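
# Usage sketch (roughly how the importbooks command is expected to drive it; `books`
# below is illustrative):
#
#     index = ReusableIndex()
#     index.open()
#     for book in books:
#         index.index_book(book)
#     index.close()   # only commits; the atexit hook (close_reusable) optimizes and closes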

class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query recursively, so that contained Term and
        Phrase queries matching the provided fields are wrapped in a BlockJoinQuery
        and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)
    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        if pd is None:
            pd = 0
        self.published_date = int(pd)

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

            if snippets:
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache
    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)
    def process_hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)
                            break

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit,
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits
    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()
    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

class Hint(object):
    """
    Given some hint information (information we already know about
    our search target, like author, title (specific book), epoch, genre, kind),
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Search instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search within these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list) are necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the 'tags' field),
        return a filter accepting only books with those specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filter using book tags (all tag categories except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds logic to figure out which indexes should be searched when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
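
    # Usage sketch (illustrative; the tag lookup below is an assumption, not taken from
    # this file): narrow a search to one author's books, e.g.
    #
    #     hint = Hint(search)
    #     hint.tags(catalogue.models.Tag.objects.filter(category='author', slug='some-author'))
    #     results = search.search_perfect_book(u"some title", hint=hint)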

class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        # self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))
    def query(self, query):
        """Parse a query in the default Lucene syntax (for humans)."""
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Run a query for books using Lucene syntax (for humans).
        Returns (books, total_hits).
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
    def get_tokens(self, searched, field='content', cached=None):
        """Return tokens analyzed by the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode, or a token list; in the
        last case the tokens are returned as-is (so tokens can be reused as long as the
        analyzer does not change).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks
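
    # e.g. (sketch) get_tokens(u"Pan Tadeusz", field='SIMPLE') should yield
    # [u'pan', u'tadeusz'], while field='POLISH' would additionally apply Polish stemming.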
    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness parameter."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5
    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break

                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)

        return phrase
    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Return term queries joined into a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q
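
    # e.g. (sketch) tokens [u'pan', u'tadeusz'] on field='authors' with the default SHOULD
    # modal build a query roughly equivalent to: authors:pan OR authors:tadeusz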
    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched)
                for found in top.scoreDocs]
    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                                         fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None))
                for found in top.scoreDocs]
    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books
    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                                     fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))
        return books
    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2,
        the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
        return books
    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        if tokens_cache is None: tokens_cache = {}

        books = []
        only_in = None
        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print("* %s theme x content: %s" % (searched, books[-1]._hits))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print("* %s scatter search: %s" % (searched, books[-1]._hits))

        return books
    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords)  -> authors
    #
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #
    #     kw_level = BooleanQuery()
    #
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))
    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Return a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None

        book_id = int(stored.get('book_id'))
        snippets = Snippets(book_id).open()
        try:
            try:
                text = snippets.get((int(position),
                                     int(length)))
            finally:
                snippets.close()

            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
            # highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
        except Exception as e:
            e2 = e
            if hasattr(e, 'getJavaException'):
                e2 = unicode(e.getJavaException())
            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
                            e2)
        return snip
    @staticmethod
    def enum_to_array(enum):
        """
        Convert a Lucene TermEnum to an array of Terms, suitable for adding to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)
    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not pdcounter:
            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
            category = doc.get('tag_category')
            if is_pdcounter == 'true':
                if category == 'pd_author':
                    tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                elif category == 'pd_book':
                    tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                    tag.category = 'pd_book'  # make it look more like a tag.
                else:
                    print("Warning: cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                    continue
            else:
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
                tags.append(tag)
        # print "%s (%d) -> %f" % (tag, tag.id, found.score)
        print('returning %s' % tags)
        return tags
    def search_books(self, query, filter=None, max_results=10):
        """
        Search for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks
    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q
    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term
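
    # e.g. term_filter(Term('is_book', 'true'), inverse=True) keeps only documents that are
    # not whole-book records, i.e. section/fragment parts (as used in search_perfect_parts).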
    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
        """
        Return auto-complete hints for tags, using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q = self.make_prefix_phrase(toks, field)
            else:
                q = self.make_term_query(toks, field, fuzzy=fuzzy)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
        """
        Return auto-complete hints for book titles (we do not index 'pseudo' title tags).
        Uses prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        if prefix:
            q = self.make_prefix_phrase(toks, 'title')
        else:
            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chain a filter list together.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
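
    # e.g. chain_filters([only_in, None, flt]) drops the None entry and ANDs the remaining
    # filters; with an empty (or all-None) list it returns None, meaning "no filtering".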
    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()