# -*- coding: utf-8 -*-

from django.conf import settings

from lucene import SimpleFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

JVM = initVM(CLASSPATH)
 
import os
import re
import errno
import atexit
import traceback

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor
from multiprocessing.pool import ThreadPool
from threading import current_thread
 
class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED, meaning basically the same
        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
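
# The mapping above means, roughly: plain metadata fields ("tags", "authors",
# "title", ...) go through SimpleAnalyzer, Polish-stemmed fields ("themes_pl",
# "tag_name_pl" and the default) through PolishAnalyzer, and identifier-like
# fields ("url", "source_url", "is_book") through KeywordAnalyzer, which keeps
# the whole value as a single token. The upper-case pseudo-fields ("KEYWORD",
# "SIMPLE", "POLISH") are not real document fields; they let callers such as
# Search.get_tokens() pick an analysis mode by name.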
 
class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
 
class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status
 
class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length), return a unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close snippet file"""
        self.file.close()
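
# A rough usage sketch (illustrative only; book_id 1 and the snippet text are
# made-up values). During indexing, snippets are appended and their offsets are
# stored in the index; during search, the same offsets are used to read them back:
#
#     snips = Snippets(1).open('w')
#     try:
#         pos = snips.add(u"Litwo! Ojczyzno moja!")   # -> (0, 21) on an empty file
#     finally:
#         snips.close()
#
#     snips = Snippets(1).open()
#     try:
#         text = snips.get(pos)                       # -> the same unicode text
#     finally:
#         snips.close()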
 
class BaseIndex(IndexStore):
    """
    Provides basic operations on index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if analyzer is None:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None, timeout=None):
        if self.index:
            raise Exception("Index is already opened")
        if analyzer is None:
            analyzer = self.analyzer
        conf = IndexWriterConfig(Version.LUCENE_34, analyzer)
        if timeout:
            conf.setWriteLockTimeout(long(timeout))
        self.index = IndexWriter(self.store, conf)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je
        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
 
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)
 
    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)
 
    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)
        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
 
    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")
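
    # Illustration (hypothetical source_name value): the regex grabs the trailing
    # year from the bibliographic source note, ignoring closing brackets/dots, e.g.
    #   published_date_re.search(u"Czytelnik, Warszawa 1990].").groups()[0] == u"1990"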
 
    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        pd = ""
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields
 
    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields, which are indexed spaces, and returns the result.
        This allows phrase queries which do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
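
    # In plain-Python terms (illustrative, with strings standing in for Field
    # objects): zip() pairs every field with a gap, reduce() concatenates the
    # pairs into one flat tuple, and the final [0:-1] drops the trailing gap:
    #   reduce(lambda a, b: a + b, zip(['Ala', 'kot'], [' ', ' ']))[0:-1]
    #   == ('Ala', ' ', 'kot')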
 
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
 
    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return
 
        def fix_format(text):
            #            separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #         if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #          text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
 
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)\
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s
 
        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['content'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)
                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       content=u''.join(footnote),
                                       is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

                        self.index.addDocument(doc)
                        #print "@ footnote text: %s" % footnote
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])
                        #print '@ FRAG %s' % frag['content']
                        self.index.addDocument(doc)

                        del fragments[fid]

                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))
                #print '@ CONTENT: %s' % fix_format(content)

                self.index.addDocument(doc)

        finally:
            snippets.close()
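
# What index_content() produces, roughly: for every book there is one metadata
# document (see index_book()), plus one document per master-level header with the
# section text, one document per footnote, and one document per themed fragment
# (with "fragment_anchor" and gap-separated "themes"/"themes_pl" fields). The raw
# text of every such part is appended to the book's Snippets file, and its
# (position, length) is stored in "snippets_position"/"snippets_length".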
 
def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise
    return _wrap
 
class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, **kw):
        if ReusableIndex.index:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer, **kw)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index:
            print("closing index")
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        if ReusableIndex.index:
            ReusableIndex.index.commit()
 
class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query recursively: Term and Phrase queries
        it contains which match the provided fields are wrapped in a
        BlockJoinQuery and so delegated to children documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return query
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
 
class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        if pd is None:
            pd = 0
        self.published_date = int(pd)

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

            if snippets:
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache
 
    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)
 
    def process_hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                continue

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if not theme in themes_hit:
                                themes_hit.append(theme)
                            break

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit,
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits
        return hits
 
    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c
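
    # A rough sketch of how callers are expected to combine results (illustrative;
    # "query" is an arbitrary user string):
    #   results = SearchResult.aggregate(search.search_perfect_book(query),
    #                                    search.search_everywhere(query))
    #   for r in sorted(results, reverse=True):   # uses __cmp__ above
    #       hits = r.process_hits()               # deduplicated sections/fragments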
 
class Hint(object):
    """
    Given some hint information (information we already know about)
    our search target - like author, title (specific book), epoch, genre, kind -
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Search instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of)
        are part of the search target.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the tags field)
        returns a filter accepting only books with the specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme)
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched, when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
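
# A rough usage sketch for Hint (illustrative; "fantastyka" is a made-up tag slug
# and the lookup is hypothetical):
#   hint = Hint(search)
#   hint.tags(catalogue.models.Tag.objects.filter(slug='fantastyka'))
#   search.search_perfect_book(u"lalka", hint=hint)   # narrowed by the tag filter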
 
class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse query in default Lucene Syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits)
        """
        tops = self.searcher.search(self.query(query), max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
 
    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: StringReader, string/unicode, or tokens. In the last case
        they will just be returned (so we can reuse tokens, if we don't change the analyzer).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks
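
    # Note on tokens_cache (illustrative): several search methods below accept a
    # shared dict so the same query string is analyzed at most once per field, e.g.:
    #   cache = {}
    #   toks = search.get_tokens(u"ala ma kota", 'SIMPLE', cached=cache)
    #   search.get_tokens(u"ala ma kota", 'SIMPLE', cached=cache)  # served from cache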
 
    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness"""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5
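
    # Illustrative values for the sanitizer above: fuzziness(False) -> None,
    # fuzziness(0.8) -> 0.8, fuzziness(True) -> 0.5 (the fallback similarity).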
 
    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break

                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase
 
    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()

        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))

        return q
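
    # For example (illustrative), make_term_query([u"ala", u"kot"], field='tags')
    # builds roughly: BooleanQuery( tags:ala SHOULD, tags:kot SHOULD ), while
    # make_phrase() above would require the tokens to appear together as a phrase.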
 
    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
 
    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
 
    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books
 
    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))

        return books
 
    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
        some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books
 
    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        if tokens_cache is None: tokens_cache = {}

        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content only query : themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)

        return books
 
    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords)  -> authors

        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))
 
    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None

        book_id = int(stored.get('book_id'))
        snippets = Snippets(book_id).open()
        try:
            try:
                text = snippets.get((int(position),
                                     int(length)))
            finally:
                snippets.close()

            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
            #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        except Exception as e:
            e2 = e
            if hasattr(e, 'getJavaException'):
                e2 = unicode(e.getJavaException())
            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
                            e2)

        return snip
 
    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        adding to a query.
        """
        terms = []
        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)
 
    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not pdcounter:
            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
            if is_pdcounter:
                tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
            else:
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
                tags.append(tag)
                #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
        print 'returning %s' % tags
        return tags
 
    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks
 
    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q
 
    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term
 
    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
        """
        Return auto-complete hints for tags
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q = self.make_prefix_phrase(toks, field)
            else:
                q = self.make_term_query(toks, field, fuzzy=fuzzy)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)

    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
        """
        Returns auto-complete hints for book titles
        (we do not index 'pseudo' title-tags).
        """
        toks = self.get_tokens(string, field='SIMPLE')

        if prefix:
            q = self.make_prefix_phrase(toks, 'title')
        else:
            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
 
    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
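
    # For instance (illustrative), hint_tags() above ends up combining an inverted
    # "tag_category:book" filter this way (via search_tags), and Hint.part_filter()
    # chains a themes filter with per-book id filters; None entries are simply dropped.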
 
    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()