# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray
JVM = initVM(CLASSPATH)

import os
import re
import errno
import atexit
import traceback

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor
from multiprocessing.pool import ThreadPool
from threading import current_thread
class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
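
        # Note: fields not registered above fall back to the default analyzer
        # passed to PerFieldAnalyzerWrapper.__init__ (the Polish one), so e.g.
        # "content" is stemmed with PolishAnalyzer.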
class IndexStore(object):
    Provides access to search index.

    self.store - lucene index directory

        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:


class IndexChecker(IndexStore):
        IndexStore.__init__(self)

        checker = CheckIndex(self.store)
        status = checker.checkIndex()
class Snippets(object):
    This class manages snippet files for an indexed object (book);
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.

    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:

        self.book_id = book_id

    def open(self, mode='r'):
        Open the snippet file. Call .close() afterwards.

        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)

    def add(self, snippet):
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.

        txt = snippet.encode('utf-8')

        pos = (self.position, l)

        Given a tuple of (position, length), return the unicode snippet
        stored there.

        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')

        """Close snippet file"""
class BaseIndex(IndexStore):
    Provides basic operations on index: opening, closing, optimizing.

    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,\
                                 IndexWriter.MaxFieldLength.LIMITED)

        self.index.optimize()

        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

    def __exit__(self, type, value, tb):
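
# BaseIndex defines __exit__ above (and is presumably paired with an __enter__
# that opens the index), so an Index can be used as a context manager.
# A rough sketch, assuming `book` is a catalogue Book instance:
#
#     with Index() as index:
#         index.index_book(book)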
class Index(BaseIndex):
    Class indexing books.

    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without polish stems), category.

        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)
    def create_book_doc(self, book):
        Create a lucene document that refers to the book by id.

        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))

    def remove_book(self, book):
        """Removes a book from search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.

            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):

        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")
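
    # published_date_re picks the trailing year out of source_name, e.g. a
    # value like u"Czytelnik, Warszawa 1990." yields "1990" (example made up).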
    def extract_metadata(self, book, book_info=None):
        Extracts metadata from the book and returns a map of fields keyed by field name.

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %\
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # get published date from source_name, if present
        if hasattr(book_info, 'source_name'):
            source = book_info.source_name
            match = self.published_date_re.search(source)
            if match is not None:
                fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
    def add_gaps(self, fields, fieldname):
        Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
        This allows phrase queries that do not cross the gaps (when slop is 0).

        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        Returns the first master tag from an etree.

        for master in root.iter():
            if master.tag in self.master_tags:
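
    # Illustration: add_gaps([authors_field_1, authors_field_2], 'authors')
    # returns [authors_field_1, <space gap>, authors_field_2], so a slop-0
    # phrase query cannot match across two distinct author values.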
    def index_content(self, book, book_fields=[]):
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.

        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)

        def walker(node, ignore_tags=[]):

            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)\
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
        snippets = Snippets(book.id).open('w')

        for header, position in zip(list(master), range(len(master))):

            if header.tag in self.skip_header_tags:
                continue
            if header.tag is etree.Comment:
                continue

            def all_content(text):
                for frag in fragments.values():
                    frag['content'].append(text)
            handle_text = [all_content]

            for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                if start is not None and start.tag in self.footnote_tags:
                    def collect_footnote(t):
                        footnote.append(t)
                    handle_text.append(collect_footnote)
                elif end is not None and footnote and end.tag in self.footnote_tags:
                    doc = add_part(snippets, header_index=position, header_type=header.tag,
                                   content=u''.join(footnote),
                                   is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

                    self.index.addDocument(doc)
                    print "@ footnote text: %s" % footnote

                # handle fragments and themes.
                if start is not None and start.tag == 'begin':
                    fid = start.attrib['id'][1:]
                    fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                # themes for this fragment
                elif start is not None and start.tag == 'motyw':
                    fid = start.attrib['id'][1:]
                    handle_text.append(None)
                    if start.text is not None:
                        fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                elif end is not None and end.tag == 'motyw':
                    handle_text.pop()

                elif start is not None and start.tag == 'end':
                    fid = start.attrib['id'][1:]
                    if fid not in fragments:
                        continue  # a broken <end> node, skip it
                    frag = fragments[fid]
                    if frag['themes'] == []:
                        continue  # empty themes list.

                    doc = add_part(snippets,
                                   header_type=frag['start_header'],
                                   header_index=frag['start_section'],
                                   header_span=position - frag['start_section'] + 1,
                                   content=fix_format(frag['content']),
                                   themes=frag['themes'])
                    print '@ FRAG %s' % frag['content']
                    self.index.addDocument(doc)

                if text is not None and handle_text:
                    hdl = handle_text[-1]

            # in the end, add a section text.
            doc = add_part(snippets, header_index=position, header_type=header.tag,
                           content=fix_format(content))
            print '@ CONTENT: %s' % fix_format(content)

            self.index.addDocument(doc)
def log_exception_wrapper(f):
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
class ReusableIndex(Index):
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None
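
# Rough usage sketch for ReusableIndex (as in the importbooks command);
# close_reusable() is registered with atexit, so it only needs to be called
# explicitly when atexit cannot be relied on:
#
#     index = ReusableIndex()
#     index.open()
#     index.index_book(book)   # `book` assumed to be a catalogue Book
#     ReusableIndex.close_reusable()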
class JoinSearch(object):
    This mixin could be used to handle block join queries.

    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        This function modifies the query recursively, so that Term and
        Phrase queries matching the provided fields are wrapped in a
        BlockJoinQuery and thus delegated to child documents.

        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))

            query.extractTerms(termset)

                if t.field() not in fields:

            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

            self._score = scoreDocs.score

        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")

        self.published_date = int(pd)

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.searched = searched
        self.tokens_cache = tokens_cache
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score

        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],

        # remove duplicate fragments
                if fragments[fid][SCORE] >= f[SCORE]:
        frags = fragments.values()

        # remove duplicate sections
            si = s[POSITION][POSITION_INDEX]
                if sections[si]['score'] >= s[SCORE]:

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,

        hits = sections.values()

            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            except catalogue.models.Fragment.DoesNotExist:

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                        if not theme in themes_hit:
                            themes_hit.append(theme)

            m = {'score': f[SCORE],
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes_hit': themes_hit

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits
    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    def aggregate(*result_lists):
        for rl in result_lists:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
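
# Results of several searches are typically merged per book with
# SearchResult.aggregate() and then ordered via __cmp__ (score first; for
# equal scores an earlier published_date ranks higher). A rough sketch,
# assuming `phrase_hits` and `other_hits` are lists of SearchResult:
#
#     results = SearchResult.aggregate(phrase_hits, other_hits)
#     results.sort(reverse=True)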
    Given some hint information (things we already know about our search
    target, like author, title (a specific book), epoch, genre or kind),
    we can narrow the search down using filters.

    def __init__(self, search):
        Accepts a Searcher instance.

    def books(self, *books):
        Give a hint that we search these books.

    def tags(self, tags):
        Give a hint that these Tag objects (a list of)

            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)
    def tag_filter(self, tags, field='tags'):
        Given a list of tags and an optional field (they are normally in the 'tags' field),
        returns a filter accepting only books with the specific tags.

            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        Filters using book tags (all tag kinds except a theme).

        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
            return self.tag_filter(tags)

    def part_filter(self):
        This filter can be used to look for book parts.
        It filters on book id and/or themes.

            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds logic to figure out which indexes should be searched when we already have some hints."""
            if field == 'authors' and 'author' in self.book_tags:
            if field == 'title' and self._books != []:
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
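
# A rough sketch of how a Hint narrows a search (the tag and book objects are
# assumed to come from the catalogue app):
#
#     search = Search()
#     hint = Hint(search)
#     hint.tags(tags)    # author/epoch/genre/kind tags become book filters
#     hint.books(book)   # restrict part searches to this one book
#     results = search.search_perfect_book(u"pan tadeusz", hint=hint)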
class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        # self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))
    def query(self, query):
        """Parse query in default Lucene syntax (for humans)."""
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax (for humans).
        Returns (books, total_hits)."""

        tops = self.searcher.search(self.query(query), max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
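
    # Example (illustrative): simple_search(u"Mickiewicz") parses the string
    # with the default QueryParser and returns (matching Book objects,
    # total hit count).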
    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode, or a list of tokens;
        in the last case they are just returned (so tokens can be reused if we
        don't change the analyzer)."""
        if cached is not None and field in cached:

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):

        tokens = self.analyzer.reusableTokenStream(field, searched)
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:

    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness"""
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        Return a PhraseQuery with a series of tokens.

            phrase = MultiPhraseQuery()
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))

                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))

                    if not fuzzterm.next(): break
                phrase.add(JArray('object')(fuzzterms, Term))

            phrase = PhraseQuery()
                term = Term(field, t)

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.

            term = Term(field, t)
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                    fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
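
    # A rough usage sketch (query string made up): a slop-2 phrase search over
    # indexed content, returning SearchResult objects with snippets attached:
    #
    #     results = search.search_phrase(u"litwo ojczyzno moja", 'content',
    #                                    snippets=True)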
    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.

        fields_to_search = ['authors', 'title']
            if not hint.should_search_for_book():
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),

            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

            if not hint.should_search_for_book():
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(q,
            self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),

        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))
    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
        some part/fragment of the book.

        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

            flt = hint.part_filter()

            top = self.searcher.search(q,
                self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),

            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and
        the rest some words from the third chapter.

        if tokens_cache is None: tokens_cache = {}

            only_in = hint.part_filter()

        # content only query : themes x content

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                            fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags

        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)
    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords)  -> authors

    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)

    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD

    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)

    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')

    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))

    #     kw_level = BooleanQuery()

    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)

    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))
    def get_snippets(self, scoreDoc, query, field='content'):
        Returns a snippet for the found scoreDoc.

        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:

        snippets = Snippets(stored.get('book_id')).open()
        text = snippets.get((int(position),

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        # highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
    def enum_to_array(enum):
        Converts a lucene TermEnum to an array of Terms, suitable for

            if not enum.next(): break

            return JArray('object')(terms, Term)
    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        Search for Tag objects using query.

            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)

        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
                tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
            # print "%s (%d) -> %f" % (tag, tag.id, found.score)

        print 'returning %s' % tags
    def search_books(self, query, filter=None, max_results=10):
        Searches for Book objects using query.

        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))

    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
        Return auto-complete hints for tags,
        using prefix search.

        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
                q = self.make_prefix_phrase(toks, field)
                q = self.make_term_query(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)

    def hint_books(self, string, max_results=50, prefix=True):
        Returns auto-complete hints for book titles,
        because we do not index 'pseudo' title tags. Uses prefix search.

        toks = self.get_tokens(string, field='SIMPLE')

            q = self.make_prefix_phrase(toks, 'title')
            q = self.make_term_query(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
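
    # Example (value made up): hint_tags(u"mick") builds prefix phrase queries
    # on tag_name / tag_name_pl and returns matching Tag objects, excluding
    # the 'book' pseudo-category via the inverse term filter above.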
    def chain_filters(filters, op=ChainedFilter.AND):
        Chains a filter list together.

        filters = filter(lambda x: x is not None, filters)
        if not filters:
        chf = ChainedFilter(JArray('object')(filters, Filter), op)

    def filtered_categories(self, tags):
        Return a list of tag categories present in the tags list.

            cats[t.category] = True