some speedups for batch indexing
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
33 import atexit
34 import traceback
35
36
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
38     def __init__(self):
39         polish = PolishAnalyzer(Version.LUCENE_34)
40         #        polish_gap.setPositionIncrementGap(999)
41
42         simple = SimpleAnalyzer(Version.LUCENE_34)
43         #        simple_gap.setPositionIncrementGap(999)
44
45         keyword = KeywordAnalyzer(Version.LUCENE_34)
46
47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
48
49         PerFieldAnalyzerWrapper.__init__(self, polish)
50
51         self.addAnalyzer("tags", simple)
52         self.addAnalyzer("technical_editors", simple)
53         self.addAnalyzer("editors", simple)
54         self.addAnalyzer("url", keyword)
55         self.addAnalyzer("source_url", keyword)
56         self.addAnalyzer("source_name", simple)
57         self.addAnalyzer("publisher", simple)
58         self.addAnalyzer("authors", simple)
59         self.addAnalyzer("title", simple)
60
61         self.addAnalyzer("is_book", keyword)
62         # shouldn't the title have two forms? _pl and simple?
63
64         self.addAnalyzer("themes", simple)
65         self.addAnalyzer("themes_pl", polish)
66
67         self.addAnalyzer("tag_name", simple)
68         self.addAnalyzer("tag_name_pl", polish)
69
70         self.addAnalyzer("translators", simple)
71
72         self.addAnalyzer("KEYWORD", keyword)
73         self.addAnalyzer("SIMPLE", simple)
74         self.addAnalyzer("POLISH", polish)
75
76
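# Editor's note (sketch, not part of the original source): PerFieldAnalyzerWrapper
# falls back to the default analyzer passed to __init__ (here PolishAnalyzer) for
# any field not registered above, e.g. the 'content' field:
#
#   analyzer = WLAnalyzer()
#   # 'tags' -> SimpleAnalyzer, 'url' -> KeywordAnalyzer, 'content' -> PolishAnalyzer (default)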
77 class IndexStore(object):
78     """
79     Provides access to search index.
80
81     self.store - lucene index directory
82     """
83     def __init__(self):
84         self.make_index_dir()
85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
86
87     def make_index_dir(self):
88         try:
89             os.makedirs(settings.SEARCH_INDEX)
90         except OSError as exc:
91             if exc.errno == errno.EEXIST:
92                 pass
93             else: raise
94
95
96 class IndexChecker(IndexStore):
97     def __init__(self):
98         IndexStore.__init__(self)
99
100     def check(self):
101         checker = CheckIndex(self.store)
102         status = checker.checkIndex()
103         return status
104
105
106 class Snippets(object):
107     """
108     This class manages snippet files for an indexed object (book).
109     The snippets are concatenated together, and their positions and
110     lengths are kept in lucene index fields.
111     """
112     SNIPPET_DIR = "snippets"
113
114     def __init__(self, book_id):
115         try:
116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117         except OSError as exc:
118             if exc.errno == errno.EEXIST:
119                 pass
120             else: raise
121         self.book_id = book_id
122         self.file = None
123
124     def open(self, mode='r'):
125         """
126         Open the snippet file. Call .close() afterwards.
127         """
128         if 'b' not in mode:
129             mode += 'b'
130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
131         self.position = 0
132         return self
133
134     def add(self, snippet):
135         """
136         Append a snippet (unicode) to the snippet file.
137         Return a (position, length) tuple
138         """
139         txt = snippet.encode('utf-8')
140         l = len(txt)
141         self.file.write(txt)
142         pos = (self.position, l)
143         self.position += l
144         return pos
145
146     def get(self, pos):
147         """
148         Given a (position, length) tuple, return the unicode snippet
149         stored there.
150         """
151         self.file.seek(pos[0], 0)
152         txt = self.file.read(pos[1]).decode('utf-8')
153         return txt
154
155     def close(self):
156         """Close snippet file"""
157         self.file.close()
158
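# Example usage (editor's sketch, not part of the original module; assumes a book
# with id 1 and a writable SEARCH_INDEX directory):
#
#   snips = Snippets(1).open('w')
#   try:
#       pos = snips.add(u"Litwo! Ojczyzno moja!")   # returns a (position, length) tuple
#   finally:
#       snips.close()
#   print Snippets(1).open().get(pos)               # round-trips the snippet text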
159
160 class BaseIndex(IndexStore):
161     """
162     Base index class.
163     Provides basic operations on index: opening, closing, optimizing.
164     """
165     def __init__(self, analyzer=None):
166         super(BaseIndex, self).__init__()
167         self.index = None
168         if not analyzer:
169             analyzer = WLAnalyzer()
170         self.analyzer = analyzer
171
172     def open(self, analyzer=None):
173         if self.index:
174             raise Exception("Index is already opened")
175         self.index = IndexWriter(self.store, self.analyzer,\
176                                  IndexWriter.MaxFieldLength.LIMITED)
177         return self.index
178
179     def optimize(self):
180         self.index.optimize()
181
182     def close(self):
183         try:
184             self.index.optimize()
185         except JavaError, je:
186             print "Error during optimize phase, check index: %s" % je
187
188         self.index.close()
189         self.index = None
190
191     def __enter__(self):
192         self.open()
193         return self
194
195     def __exit__(self, type, value, tb):
196         self.close()
197
198
199 class Index(BaseIndex):
200     """
201     Class indexing books.
202     """
203     def __init__(self, analyzer=None):
204         super(Index, self).__init__(analyzer)
205
206     def index_tags(self):
207         """
208         Re-index global tag list.
209         Removes all tags from the index, then indexes them again.
210         Indexed fields include: id, name (with and without Polish stems), category.
211         """
212         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
213         self.index.deleteDocuments(q)
214
215         for tag in catalogue.models.Tag.objects.all():
216             doc = Document()
217             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
218             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
220             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
221             self.index.addDocument(doc)
222
223         for pdtag in PDCounterAuthor.objects.all():
224             doc = Document()
225             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
226             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
227             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
228             doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
229             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
230             self.index.addDocument(doc)
231
232     def create_book_doc(self, book):
233         """
234         Create a lucene document referring to the book id.
235         """
236         doc = Document()
237         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
238         if book.parent is not None:
239             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
240         return doc
241
242     def remove_book(self, book):
243         """Removes a book from search index.
244         book - Book instance."""
245         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
246         self.index.deleteDocuments(q)
247
248     def index_book(self, book, book_info=None, overwrite=True):
249         """
250         Indexes the book.
251         Creates a lucene document for extracted metadata
252         and calls self.index_content() to index the contents of the book.
253         """
254         if overwrite:
255             self.remove_book(book)
256
257         book_doc = self.create_book_doc(book)
258         meta_fields = self.extract_metadata(book, book_info)
259         for f in meta_fields.values():
260             if isinstance(f, list) or isinstance(f, tuple):
261                 for elem in f:
262                     book_doc.add(elem)
263             else:
264                 book_doc.add(f)
265
266         self.index.addDocument(book_doc)
267         del book_doc
268
269         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
270
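    # Example (editor's sketch, not from the original source): indexing a single book
    # with the writer opened via the context manager from BaseIndex.__enter__/__exit__.
    # The slug used here is hypothetical:
    #
    #   book = catalogue.models.Book.objects.get(slug='pan-tadeusz')
    #   with Index() as idx:
    #       idx.index_book(book)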
271     master_tags = [
272         'opowiadanie',
273         'powiesc',
274         'dramat_wierszowany_l',
275         'dramat_wierszowany_lp',
276         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
277         'wywiad',
278         ]
279
280     ignore_content_tags = [
281         'uwaga', 'extra',
282         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
283         'didaskalia',
284         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
285         ]
286
287     footnote_tags = ['pa', 'pt', 'pr', 'pe']
288
289     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
290
291     published_date_re = re.compile("([0-9]+)[\]. ]*$")
292
293     def extract_metadata(self, book, book_info=None):
294         """
295         Extract metadata from the book and return a map of fields keyed by field name.
296         """
297         fields = {}
298
299         if book_info is None:
300             book_info = dcparser.parse(open(book.xml_file.path))
301
302         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
303         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
304         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
305
306         # validator, name
307         for field in dcparser.BookInfo.FIELDS:
308             if hasattr(book_info, field.name):
309                 if not getattr(book_info, field.name):
310                     continue
311                 # since no type information is available, we use validator
312                 type_indicator = field.validator
313                 if type_indicator == dcparser.as_unicode:
314                     s = getattr(book_info, field.name)
315                     if field.multiple:
316                         s = ', '.join(s)
317                     try:
318                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
319                     except JavaError as je:
320                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
321                 elif type_indicator == dcparser.as_person:
322                     p = getattr(book_info, field.name)
323                     if isinstance(p, dcparser.Person):
324                         persons = unicode(p)
325                     else:
326                         persons = ', '.join(map(unicode, p))
327                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
328                 elif type_indicator == dcparser.as_date:
329                     dt = getattr(book_info, field.name)
330                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
331                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
332
333         # get published date
334         pd = None
335         if hasattr(book_info, 'source_name') and book_info.source_name:
336             match = self.published_date_re.search(book_info.source_name)
337             if match is not None:
338                 pd = str(match.groups()[0])
339         if not pd: pd = ""
340         fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
341
342         return fields
343
344     def add_gaps(self, fields, fieldname):
345         """
346         Interposes a list of fields with gap fields (indexed spaces) and returns the result.
347         This allows phrase queries which do not cross the gaps (when slop is 0).
348         """
349         def gap():
350             while True:
351                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
352         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
353
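    # Sketch of the effect (editor's note, not from the original source): for two tag
    # values "Adam Mickiewicz" and "Juliusz Słowacki", add_gaps(...) yields
    #   [tags:"Adam Mickiewicz", tags:" ", tags:"Juliusz Słowacki"]
    # so the gap token takes up one position and a slop-0 phrase query such as
    # "Mickiewicz Juliusz" cannot match across the boundary between the two values.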
354     def get_master(self, root):
355         """
356         Returns the first master tag from an etree.
357         """
358         for master in root.iter():
359             if master.tag in self.master_tags:
360                 return master
361
362     def index_content(self, book, book_fields=[]):
363         """
364         Walks the book XML and extracts content from it.
365         Adds parts for each header tag and for each fragment.
366         """
367         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
368         root = wld.edoc.getroot()
369
370         master = self.get_master(root)
371         if master is None:
372             return []
373
374         def walker(node, ignore_tags=[]):
375
376             if node.tag not in ignore_tags:
377                 yield node, None, None
378                 if node.text is not None:
379                     yield None, node.text, None
380                 for child in list(node):
381                     for b, t, e in walker(child, ignore_tags=ignore_tags):
382                         yield b, t, e
383                 yield None, None, node
384
385             if node.tail is not None:
386                 yield None, node.tail, None
387             return
388
389         def fix_format(text):
390             #            separator = [u" ", u"\t", u".", u";", u","]
391             if isinstance(text, list):
392                 # need to join it first
393                 text = filter(lambda s: s is not None, text)
394                 text = u' '.join(text)
395                 # for i in range(len(text)):
396                 #     if i > 0:
397                 #         if text[i][0] not in separator\
398                 #             and text[i - 1][-1] not in separator:
399                 #          text.insert(i, u" ")
400
401             return re.sub("(?m)/$", "", text)
402
403         def add_part(snippets, **fields):
404             doc = self.create_book_doc(book)
405             for f in book_fields:
406                 doc.add(f)
407
408             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
409             doc.add(NumericField("header_span", Field.Store.YES, True)\
410                     .setIntValue(fields.get('header_span', 1)))
411             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
412
413             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
414                           Field.TermVector.WITH_POSITIONS_OFFSETS))
415
416             snip_pos = snippets.add(fields["content"])
417             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
418             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
419
420             if 'fragment_anchor' in fields:
421                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
422                               Field.Store.YES, Field.Index.NOT_ANALYZED))
423
424             if 'themes' in fields:
425                 themes, themes_pl = zip(*[
426                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
427                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
428                      for theme in fields['themes']])
429
430                 themes = self.add_gaps(themes, 'themes')
431                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
432
433                 for t in themes:
434                     doc.add(t)
435                 for t in themes_pl:
436                     doc.add(t)
437
438             return doc
439
440         def give_me_utf8(s):
441             if isinstance(s, unicode):
442                 return s.encode('utf-8')
443             else:
444                 return s
445
446         fragments = {}
447         snippets = Snippets(book.id).open('w')
448         try:
449             for position, header in enumerate(master):
450
451                 if header.tag in self.skip_header_tags:
452                     continue
453                 if header.tag is etree.Comment:
454                     continue
455
456                 # section content
457                 content = []
458                 footnote = []
459
460                 def all_content(text):
461                     for frag in fragments.values():
462                         frag['content'].append(text)
463                     content.append(text)
464                 handle_text = [all_content]
465
466
467                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
468                     # handle footnotes
469                     if start is not None and start.tag in self.footnote_tags:
470                         footnote = []
471                         def collect_footnote(t):
472                             footnote.append(t)
473                         handle_text.append(collect_footnote)
474                     elif end is not None and end.tag in self.footnote_tags:
475                         handle_text.pop()
476                         doc = add_part(snippets, header_index=position, header_type=header.tag,
477                                        content=u''.join(footnote),
478                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
479                 
480                         self.index.addDocument(doc)
481                         #print "@ footnote text: %s" % footnote
482                         footnote = []
483                     
484                     # handle fragments and themes.
485                     if start is not None and start.tag == 'begin':
486                         fid = start.attrib['id'][1:]
487                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
488
489                     # themes for this fragment
490                     elif start is not None and start.tag == 'motyw':
491                         fid = start.attrib['id'][1:]
492                         handle_text.append(None)
493                         if start.text is not None:
494                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
495                     elif end is not None and end.tag == 'motyw':
496                         handle_text.pop()
497
498                     elif start is not None and start.tag == 'end':
499                         fid = start.attrib['id'][1:]
500                         if fid not in fragments:
501                             continue  # a broken <end> node, skip it
502                         frag = fragments[fid]
503                         if frag['themes'] == []:
504                             continue  # empty themes list.
505                         del fragments[fid]
506
507                         doc = add_part(snippets,
508                                        header_type=frag['start_header'],
509                                        header_index=frag['start_section'],
510                                        header_span=position - frag['start_section'] + 1,
511                                        fragment_anchor=fid,
512                                        content=fix_format(frag['content']),
513                                        themes=frag['themes'])
514                         #print '@ FRAG %s' % frag['content']
515                         self.index.addDocument(doc)
516
517                         # Collect content.
518
519                     if text is not None and handle_text:
520                         hdl = handle_text[-1]
521                         if hdl is not None:
522                             hdl(text)
523
524                         # in the end, add a section text.
525                 doc = add_part(snippets, header_index=position, header_type=header.tag,
526                                content=fix_format(content))
527                 #print '@ CONTENT: %s' % fix_format(content)
528
529                 self.index.addDocument(doc)
530
531         finally:
532             snippets.close()
533
534
535 def log_exception_wrapper(f):
536     def _wrap(*a):
537         try:
538             f(*a)
539         except Exception, e:
540             print("Error in indexing thread: %s" % e)
541             traceback.print_exc()
542             raise e
543     return _wrap
544
545
546 class ReusableIndex(Index):
547     """
548     Works like Index, but does not close/optimize the Lucene index
549     until program exit (uses an atexit hook).
550     This is useful for the importbooks command.
551
552     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
553     """
554     index = None
555
556     def open(self, analyzer=None, threads=4):
557         if ReusableIndex.index is not None:
558             self.index = ReusableIndex.index
559         else:
560             print("opening index")
561             Index.open(self, analyzer)
562             ReusableIndex.index = self.index
563             atexit.register(ReusableIndex.close_reusable)
564
565     # def index_book(self, *args, **kw):
566     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
567     #     ReusableIndex.pool_jobs.append(job)
568
569     @staticmethod
570     def close_reusable():
571         if ReusableIndex.index is not None:
572             ReusableIndex.index.optimize()
573             ReusableIndex.index.close()
574             ReusableIndex.index = None
575
576     def close(self):
577         pass
578
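# Example (editor's sketch, not from the original source): batch indexing with a
# shared writer, as used by the importbooks command. The index stays open between
# books and is optimized/closed once, at process exit (or explicitly):
#
#   idx = ReusableIndex()
#   idx.open()
#   for book in catalogue.models.Book.objects.all():
#       idx.index_book(book)
#   ReusableIndex.close_reusable()   # optional if the atexit hook can be relied upon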
579
580 class JoinSearch(object):
581     """
582     This mixin could be used to handle block join queries.
583     (currently unused)
584     """
585     def __init__(self, *args, **kw):
586         super(JoinSearch, self).__init__(*args, **kw)
587
588     def wrapjoins(self, query, fields=[]):
589         """
590         This function modifies the query recursively,
591         so that contained Term and Phrase queries which match
592         the provided fields are wrapped in a BlockJoinQuery
593         and thus delegated to child documents.
594         """
595         if BooleanQuery.instance_(query):
596             qs = BooleanQuery.cast_(query)
597             for clause in qs:
598                 clause = BooleanClause.cast_(clause)
599                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
600             return qs
601         else:
602             termset = HashSet()
603             query.extractTerms(termset)
604             for t in termset:
605                 t = Term.cast_(t)
606                 if t.field() not in fields:
607                     return query
608             return BlockJoinQuery(query, self.parent_filter,
609                                   BlockJoinQuery.ScoreMode.Total)
610
611     def bsearch(self, query, max_results=50):
612         q = self.query(query)
613         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
614
615         tops = self.searcher.search(bjq, max_results)
616         bks = []
617         for found in tops.scoreDocs:
618             doc = self.searcher.doc(found.doc)
619             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
620         return (bks, tops.totalHits)
621
622
623 class SearchResult(object):
624     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
625         if tokens_cache is None: tokens_cache = {}
626
627         if score:
628             self._score = score
629         else:
630             self._score = scoreDocs.score
631
632         self.boost = 1.0
633
634         self._hits = []
635         self._processed_hits = None  # processed hits
636
637         stored = search.searcher.doc(scoreDocs.doc)
638         self.book_id = int(stored.get("book_id"))
639
640         pd = stored.get("published_date")
641         if pd is None:
642             pd = 0
643         self.published_date = int(pd)
644
645         header_type = stored.get("header_type")
646         # we have a content hit in some header or fragment
647         if header_type is not None:
648             sec = (header_type, int(stored.get("header_index")))
649             header_span = stored.get('header_span')
650             header_span = header_span is not None and int(header_span) or 1
651
652             fragment = stored.get("fragment_anchor")
653
654             if snippets:
655                 snippets = snippets.replace("/\n", "\n")
656             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
657
658             self._hits.append(hit)
659
660         self.search = search
661         self.searched = searched
662         self.tokens_cache = tokens_cache
663
664     @property
665     def score(self):
666         return self._score * self.boost
667
668     def merge(self, other):
669         if self.book_id != other.book_id:
670             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
671         self._hits += other._hits
672         if other.score > self.score:
673             self._score = other._score
674         return self
675
676     def get_book(self):
677         return catalogue.models.Book.objects.get(id=self.book_id)
678
679     book = property(get_book)
680
681     @property
682     def hits(self):
683         if self._processed_hits is not None:
684             return self._processed_hits
685
686         POSITION = 0
687         FRAGMENT = 1
688         POSITION_INDEX = 1
689         POSITION_SPAN = 2
690         SCORE = 2
691         OTHER = 3
692
693         # to sections and fragments
694         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
695         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
696         sect = filter(lambda s: 0 == len(filter(
697             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
698             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
699             frags)), sect)
700
701         hits = []
702
703         # remove duplicate fragments
704         fragments = {}
705         for f in frags:
706             fid = f[FRAGMENT]
707             if fid in fragments:
708                 if fragments[fid][SCORE] >= f[SCORE]:
709                     continue
710             fragments[fid] = f
711         frags = fragments.values()
712
713         # remove duplicate sections
714         sections = {}
715
716         for s in sect:
717             si = s[POSITION][POSITION_INDEX]
718             # skip existing
719             if si in sections:
720                 if sections[si]['score'] >= s[SCORE]:
721                     continue
722
723             m = {'score': s[SCORE],
724                  'section_number': s[POSITION][POSITION_INDEX] + 1,
725                  }
726             m.update(s[OTHER])
727             sections[si] = m
728
729         hits = sections.values()
730
731         for f in frags:
732             try:
733                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
734             except catalogue.models.Fragment.DoesNotExist:
735                 # stale index
736                 continue
737
738             # Figure out if we were searching for a token matching some word in theme name.
739             themes = frag.tags.filter(category='theme')
740             themes_hit = []
741             if self.searched is not None:
742                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
743                 for theme in themes:
744                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
745                     for t in tokens:
746                         if t in name_tokens:
747                             if not theme in themes_hit:
748                                 themes_hit.append(theme)
749                             break
750
751             m = {'score': f[SCORE],
752                  'fragment': frag,
753                  'section_number': f[POSITION][POSITION_INDEX] + 1,
754                  'themes': themes,
755                  'themes_hit': themes_hit
756                  }
757             m.update(f[OTHER])
758             hits.append(m)
759
760         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
761
762         self._processed_hits = hits
763
764         return hits
765
766     def __unicode__(self):
767         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
768
769     @staticmethod
770     def aggregate(*result_lists):
771         books = {}
772         for rl in result_lists:
773             for r in rl:
774                 if r.book_id in books:
775                     books[r.book_id].merge(r)
776                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
777                 else:
778                     books[r.book_id] = r
779         return books.values()
780
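    # Example (editor's sketch; phrase_hits and everywhere_hits are hypothetical lists
    # of SearchResult objects): merging hits for the same book coming from different
    # search strategies, then ordering by score (earlier publication date wins ties,
    # see __cmp__ below):
    #
    #   results = SearchResult.aggregate(phrase_hits, everywhere_hits)
    #   results.sort(reverse=True)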
781     def __cmp__(self, other):
782         c = cmp(self.score, other.score)
783         if c == 0:
784             # this is inverted, because earlier date is better
785             return cmp(other.published_date, self.published_date)
786         else:
787             return c
788
789
790 class Hint(object):
791     """
792     Given some hint information (things we already know about
793     our search target - like author, title of a specific book, epoch, genre, kind)
794     we can narrow down the search using filters.
795     """
796     def __init__(self, search):
797         """
798         Accepts a Searcher instance.
799         """
800         self.search = search
801         self.book_tags = {}
802         self.part_tags = []
803         self._books = []
804
805     def books(self, *books):
806         """
807         Give a hint that we search these books.
808         """
809         self._books = books
810
811     def tags(self, tags):
812         """
813         Give a hint that these Tag objects (a list of them)
814         are necessary.
815         """
816         for t in tags:
817             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
818                 lst = self.book_tags.get(t.category, [])
819                 lst.append(t)
820                 self.book_tags[t.category] = lst
821             if t.category in ['theme', 'theme_pl']:
822                 self.part_tags.append(t)
823
824     def tag_filter(self, tags, field='tags'):
825         """
826         Given a list of tags and an optional field (they are normally in the 'tags' field),
827         returns a filter accepting only books with the specified tags.
828         """
829         q = BooleanQuery()
830
831         for tag in tags:
832             toks = self.search.get_tokens(tag.name, field=field)
833             tag_phrase = PhraseQuery()
834             for tok in toks:
835                 tag_phrase.add(Term(field, tok))
836             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
837
838         return QueryWrapperFilter(q)
839
840     def book_filter(self):
841         """
842         Filters using book tags (all tag kinds except themes).
843         """
844         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
845         if tags:
846             return self.tag_filter(tags)
847         else:
848             return None
849
850     def part_filter(self):
851         """
852         This filter can be used to look for book parts.
853         It filters on book id and/or themes.
854         """
855         fs = []
856         if self.part_tags:
857             fs.append(self.tag_filter(self.part_tags, field='themes'))
858
859         if self._books != []:
860             bf = BooleanFilter()
861             for b in self._books:
862                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
863                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
864             fs.append(bf)
865
866         return Search.chain_filters(fs)
867
868     def should_search_for_book(self):
869         return self._books == []
870
871     def just_search_in(self, all):
872         """Holds logic to figure out which indexes should be searched, when we have some hints already."""
873         some = []
874         for field in all:
875             if field == 'authors' and 'author' in self.book_tags:
876                 continue
877             if field == 'title' and self._books != []:
878                 continue
879             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
880                 continue
881             some.append(field)
882         return some
883
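# Example (editor's sketch, not from the original source): narrowing a search with a
# Hint built from tags the user has already selected (selected_tags is a hypothetical
# list of Tag objects):
#
#   search = Search()
#   hint = search.hint()
#   hint.tags(selected_tags)
#   results = search.search_perfect_book(u"dziady", hint=hint)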
884
885 class Search(IndexStore):
886     """
887     Search facilities.
888     """
889     def __init__(self, default_field="content"):
890         IndexStore.__init__(self)
891         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
892         # self.analyzer = WLAnalyzer()
893         self.searcher = IndexSearcher(self.store, True)
894         self.parser = QueryParser(Version.LUCENE_34, default_field,
895                                   self.analyzer)
896
897         self.parent_filter = TermsFilter()
898         self.parent_filter.addTerm(Term("is_book", "true"))
899
900     def query(self, query):
901         """Parse query in default Lucene Syntax. (for humans)
902         """
903         return self.parser.parse(query)
904
905     def simple_search(self, query, max_results=50):
906         """Runs a query for books using lucene syntax. (for humans)
907         Returns (books, total_hits)
908         """
909
910         tops = self.searcher.search(self.query(query), max_results)
911         bks = []
912         for found in tops.scoreDocs:
913             doc = self.searcher.doc(found.doc)
914             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
915         return (bks, tops.totalHits)
916
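    # Example (editor's sketch): a quick query in lucene syntax, assuming the index has
    # already been built:
    #
    #   search = Search()
    #   books, total = search.simple_search(u'authors: mickiewicz', max_results=10)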
917     def get_tokens(self, searched, field='content', cached=None):
918         """Returns tokens analyzed by the analyzer appropriate for the given field.
919         The argument can be a StringReader, a string/unicode, or a list of tokens. In the last case
920         the tokens are just returned (so we can reuse them, if we don't change the analyzer).
921         """
922         if cached is not None and field in cached:
923             return cached[field]
924
925         if isinstance(searched, str) or isinstance(searched, unicode):
926             searched = StringReader(searched)
927         elif isinstance(searched, list):
928             return searched
929
930         searched.reset()
931         tokens = self.analyzer.reusableTokenStream(field, searched)
932         toks = []
933         while tokens.incrementToken():
934             cta = tokens.getAttribute(CharTermAttribute.class_)
935             toks.append(cta.toString())
936
937         if cached is not None:
938             cached[field] = toks
939
940         return toks
941
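    # Example (editor's sketch; `search` is a hypothetical Search instance): reusing a
    # tokens cache for repeated analyses of the same search string on the same field,
    # so the string is only run through the analyzer once:
    #
    #   cache = {}
    #   toks = search.get_tokens(u"pan tadeusz", field='SIMPLE', cached=cache)
    #   toks_again = search.get_tokens(u"pan tadeusz", field='SIMPLE', cached=cache)  # served from cache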
942     def fuzziness(self, fuzzy):
943         """Helper method to sanitize fuzziness"""
944         if not fuzzy:
945             return None
946         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
947             return fuzzy
948         else:
949             return 0.5
950
951     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
952         """
953         Return a PhraseQuery with a series of tokens.
954         """
955         if fuzzy:
956             phrase = MultiPhraseQuery()
957             for t in tokens:
958                 term = Term(field, t)
959                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
960                 fuzzterms = []
961
962                 while True:
963                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
964                     ft = fuzzterm.term()
965                     if ft:
966                         fuzzterms.append(ft)
967                     if not fuzzterm.next(): break
968                 if fuzzterms:
969                     phrase.add(JArray('object')(fuzzterms, Term))
970                 else:
971                     phrase.add(term)
972         else:
973             phrase = PhraseQuery()
974             phrase.setSlop(slop)
975             for t in tokens:
976                 term = Term(field, t)
977                 phrase.add(term)
978         return phrase
979
980     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
981         """
982         Returns term queries joined by a boolean query.
983         modal - applies to the boolean query
984         fuzzy - should the query be fuzzy.
985         """
986         q = BooleanQuery()
987         for t in tokens:
988             term = Term(field, t)
989             if fuzzy:
990                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
991             else:
992                 term = TermQuery(term)
993             q.add(BooleanClause(term, modal))
994         return q
995
996     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
997                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
998         if filters is None: filters = []
999         if tokens_cache is None: tokens_cache = {}
1000
1001         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1002
1003         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1004         if book:
1005             filters.append(self.term_filter(Term('is_book', 'true')))
1006         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1007
1008         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1009
1010     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1011                     filters=None, tokens_cache=None, boost=None, snippets=True):
1012         if filters is None: filters = []
1013         if tokens_cache is None: tokens_cache = {}
1014
1015         if book:
1016             filters.append(self.term_filter(Term('is_book', 'true')))
1017
1018         query = BooleanQuery()
1019
1020         for fld in fields:
1021             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1022
1023             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1024                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1025
1026         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1027
1028         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1029                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1030
1031     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1032         """
1033         Search for perfect book matches. Just see if the query matches some author or title,
1034         taking hints into account.
1035         """
1036         fields_to_search = ['authors', 'title']
1037         only_in = None
1038         if hint:
1039             if not hint.should_search_for_book():
1040                 return []
1041             fields_to_search = hint.just_search_in(fields_to_search)
1042             only_in = hint.book_filter()
1043
1044         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1045
1046         books = []
1047         for q in qrys:
1048             top = self.searcher.search(q,
1049                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1050                 max_results)
1051             for found in top.scoreDocs:
1052                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1053         return books
1054
1055     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1056         fields_to_search = ['tags', 'authors', 'title']
1057
1058         only_in = None
1059         if hint:
1060             if not hint.should_search_for_book():
1061                 return []
1062             fields_to_search = hint.just_search_in(fields_to_search)
1063             only_in = hint.book_filter()
1064
1065         tokens = self.get_tokens(searched, field='SIMPLE')
1066
1067         q = BooleanQuery()
1068
1069         for fld in fields_to_search:
1070             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1071                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1072
1073         books = []
1074         top = self.searcher.search(q,
1075                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1076             max_results)
1077         for found in top.scoreDocs:
1078             books.append(SearchResult(self, found, how_found="search_book"))
1079
1080         return books
1081
1082     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1083         """
1084         Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
1085         some part/fragment of the book.
1086         """
1087         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1088
1089         flt = None
1090         if hint:
1091             flt = hint.part_filter()
1092
1093         books = []
1094         for q in qrys:
1095             top = self.searcher.search(q,
1096                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1097                                                            flt]),
1098                                        max_results)
1099             for found in top.scoreDocs:
1100                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1101
1102         return books
1103
1104     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1105         """
1106         Tries to use search terms to match different fields of a book (or its parts).
1107         E.g. one word can be an author's surname, another a part of the title, and the rest
1108         some words from the third chapter.
1109         """
1110         if tokens_cache is None: tokens_cache = {}
1111         books = []
1112         only_in = None
1113
1114         if hint:
1115             only_in = hint.part_filter()
1116
1117         # content only query : themes x content
1118         q = BooleanQuery()
1119
1120         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1121         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1122
1123         # only search in themes when we do not already filter by themes
1124         if hint is None or hint.just_search_in(['themes']) != []:
1125             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1126                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1127
1128         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1129                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1130
1131         topDocs = self.searcher.search(q, only_in, max_results)
1132         for found in topDocs.scoreDocs:
1133             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1134             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1135
1136         # query themes/content x author/title/tags
1137         q = BooleanQuery()
1138         in_content = BooleanQuery()
1139         in_meta = BooleanQuery()
1140
1141         for fld in ['themes_pl', 'content']:
1142             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1143
1144         for fld in ['tags', 'authors', 'title']:
1145             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1146
1147         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1148         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1149
1150         topDocs = self.searcher.search(q, only_in, max_results)
1151         for found in topDocs.scoreDocs:
1152             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1153             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1154
1155         return books
1156
1157     # def multisearch(self, query, max_results=50):
1158     #     """
1159     #     Search strategy:
1160     #     - (phrase) OR -> content
1161     #                   -> title
1162     #                   -> authors
1163     #     - (keywords)  -> authors
1164     #                   -> motyw
1165     #                   -> tags
1166     #                   -> content
1167     #     """
1168         # queryreader = StringReader(query)
1169         # tokens = self.get_tokens(queryreader)
1170
1171         # top_level = BooleanQuery()
1172         # Should = BooleanClause.Occur.SHOULD
1173
1174         # phrase_level = BooleanQuery()
1175         # phrase_level.setBoost(1.3)
1176
1177         # p_content = self.make_phrase(tokens, joined=True)
1178         # p_title = self.make_phrase(tokens, 'title')
1179         # p_author = self.make_phrase(tokens, 'author')
1180
1181         # phrase_level.add(BooleanClause(p_content, Should))
1182         # phrase_level.add(BooleanClause(p_title, Should))
1183         # phrase_level.add(BooleanClause(p_author, Should))
1184
1185         # kw_level = BooleanQuery()
1186
1187         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1188         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1189         # kw_level.add(j_themes, Should)
1190         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1191         # j_con = self.make_term_query(tokens, joined=True)
1192         # kw_level.add(j_con, Should)
1193
1194         # top_level.add(BooleanClause(phrase_level, Should))
1195         # top_level.add(BooleanClause(kw_level, Should))
1196
1197         # return None
1198
1199     def get_snippets(self, scoreDoc, query, field='content'):
1200         """
1201         Returns a snippet for found scoreDoc.
1202         """
1203         htmlFormatter = SimpleHTMLFormatter()
1204         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1205
1206         stored = self.searcher.doc(scoreDoc.doc)
1207
1208         position = stored.get('snippets_position')
1209         length = stored.get('snippets_length')
1210         if position is None or length is None:
1211             return None
1212         # locate content.
1213         snippets = Snippets(stored.get('book_id')).open()
1214         try:
1215             text = snippets.get((int(position),
1216                                  int(length)))
1217         finally:
1218             snippets.close()
1219
1220         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1221         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1222         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1223
1224         return snip
1225
1226     @staticmethod
1227     def enum_to_array(enum):
1228         """
1229         Converts a lucene TermEnum to an array of Terms, suitable for
1230         addition to queries
1231         """
1232         terms = []
1233
1234         while True:
1235             t = enum.term()
1236             if t:
1237                 terms.append(t)
1238             if not enum.next(): break
1239
1240         if terms:
1241             return JArray('object')(terms, Term)
1242
1243     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1244         """
1245         Search for Tag objects using query.
1246         """
1247         if not pdcounter:
1248             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1249         tops = self.searcher.search(query, filters, max_results)
1250
1251         tags = []
1252         for found in tops.scoreDocs:
1253             doc = self.searcher.doc(found.doc)
1254             is_pdcounter = doc.get('is_pdcounter')
1255             if is_pdcounter:
1256                 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1257             else:
1258                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1259                 # don't add the pdcounter tag if same tag already exists
1260             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1261                 tags.append(tag)
1262                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1263         print 'returning %s' % tags
1264         return tags
1265
1266     def search_books(self, query, filter=None, max_results=10):
1267         """
1268         Searches for Book objects using query
1269         """
1270         bks = []
1271         tops = self.searcher.search(query, filter, max_results)
1272         for found in tops.scoreDocs:
1273             doc = self.searcher.doc(found.doc)
1274             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1275         return bks
1276
1277     def make_prefix_phrase(self, toks, field):
1278         q = MultiPhraseQuery()
1279         for i in range(len(toks)):
1280             t = Term(field, toks[i])
1281             if i == len(toks) - 1:
1282                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1283                 if pterms:
1284                     q.add(pterms)
1285                 else:
1286                     q.add(t)
1287             else:
1288                 q.add(t)
1289         return q
1290
1291     @staticmethod
1292     def term_filter(term, inverse=False):
1293         only_term = TermsFilter()
1294         only_term.addTerm(term)
1295
1296         if inverse:
1297             neg = BooleanFilter()
1298             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1299             only_term = neg
1300
1301         return only_term
1302
1303     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
1304         """
1305         Return auto-complete hints for tags
1306         using prefix search.
1307         """
1308         toks = self.get_tokens(string, field='SIMPLE')
1309         top = BooleanQuery()
1310
1311         for field in ['tag_name', 'tag_name_pl']:
1312             if prefix:
1313                 q = self.make_prefix_phrase(toks, field)
1314             else:
1315                 q = self.make_term_query(toks, field)
1316             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1317
1318         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1319
1320         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1321
1322     def hint_books(self, string, max_results=50, prefix=True):
1323         """
1324         Returns auto-complete hints for book titles
1325         (because we do not index 'pseudo' title tags).
1326         Uses prefix search.
1327         """
1328         toks = self.get_tokens(string, field='SIMPLE')
1329
1330         if prefix:
1331             q = self.make_prefix_phrase(toks, 'title')
1332         else:
1333             q = self.make_term_query(toks, 'title')
1334
1335         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1336
1337     @staticmethod
1338     def chain_filters(filters, op=ChainedFilter.AND):
1339         """
1340         Chains a filter list together
1341         """
1342         filters = filter(lambda x: x is not None, filters)
1343         if not filters:
1344             return None
1345         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1346         return chf
1347
1348     def filtered_categories(self, tags):
1349         """
1350         Return a list of tag categories, present in tags list.
1351         """
1352         cats = {}
1353         for t in tags:
1354             cats[t.category] = True
1355         return cats.keys()
1356
1357     def hint(self):
1358         return Hint(self)