Commit on close of reusable indexer
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
33 import atexit
34 import traceback
35
36
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
38     def __init__(self):
39         polish = PolishAnalyzer(Version.LUCENE_34)
40         #        polish_gap.setPositionIncrementGap(999)
41
42         simple = SimpleAnalyzer(Version.LUCENE_34)
43         #        simple_gap.setPositionIncrementGap(999)
44
45         keyword = KeywordAnalyzer(Version.LUCENE_34)
46
47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
48
49         PerFieldAnalyzerWrapper.__init__(self, polish)
50
51         self.addAnalyzer("tags", simple)
52         self.addAnalyzer("technical_editors", simple)
53         self.addAnalyzer("editors", simple)
54         self.addAnalyzer("url", keyword)
55         self.addAnalyzer("source_url", keyword)
56         self.addAnalyzer("source_name", simple)
57         self.addAnalyzer("publisher", simple)
58         self.addAnalyzer("authors", simple)
59         self.addAnalyzer("title", simple)
60
61         self.addAnalyzer("is_book", keyword)
62         # shouldn't the title have two forms? _pl and simple?
63
64         self.addAnalyzer("themes", simple)
65         self.addAnalyzer("themes_pl", polish)
66
67         self.addAnalyzer("tag_name", simple)
68         self.addAnalyzer("tag_name_pl", polish)
69
70         self.addAnalyzer("translators", simple)
71
72         self.addAnalyzer("KEYWORD", keyword)
73         self.addAnalyzer("SIMPLE", simple)
74         self.addAnalyzer("POLISH", polish)
75
76
77 class IndexStore(object):
78     """
79     Provides access to the search index.
80
81     self.store - lucene index directory
82     """
83     def __init__(self):
84         self.make_index_dir()
85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
86
87     def make_index_dir(self):
88         try:
89             os.makedirs(settings.SEARCH_INDEX)
90         except OSError as exc:
91             if exc.errno == errno.EEXIST:
92                 pass
93             else: raise
94
95
96 class IndexChecker(IndexStore):
97     def __init__(self):
98         IndexStore.__init__(self)
99
100     def check(self):
101         checker = CheckIndex(self.store)
102         status = checker.checkIndex()
103         return status
104
105
106 class Snippets(object):
107     """
108     This class manages snippet files for an indexed object (a book).
109     The snippets are concatenated together; their positions and
110     lengths are kept in Lucene index fields.
111     """
112     SNIPPET_DIR = "snippets"
113
114     def __init__(self, book_id):
115         try:
116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117         except OSError as exc:
118             if exc.errno == errno.EEXIST:
119                 pass
120             else: raise
121         self.book_id = book_id
122         self.file = None
123
124     def open(self, mode='r'):
125         """
126         Open the snippet file. Call .close() afterwards.
127         """
128         if not 'b' in mode:
129             mode += 'b'
130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
131         self.position = 0
132         return self
133
134     def add(self, snippet):
135         """
136         Append a snippet (unicode) to the snippet file.
137         Return a (position, length) tuple
138         """
139         txt = snippet.encode('utf-8')
140         l = len(txt)
141         self.file.write(txt)
142         pos = (self.position, l)
143         self.position += l
144         return pos
145
146     def get(self, pos):
147         """
148         Given a (position, length) tuple, return the unicode text
149         of the snippet stored there.
150         """
151         self.file.seek(pos[0], 0)
152         txt = self.file.read(pos[1]).decode('utf-8')
153         return txt
154
155     def close(self):
156         """Close snippet file"""
157         self.file.close()
158
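# Illustrative sketch (not part of the indexing code): writing a snippet for an
# assumed book id and reading it back later. The (position, length) tuple
# returned by add() is what gets stored in the Lucene index fields.
#
#   snippets = Snippets(book_id).open('w')
#   try:
#       pos = snippets.add(u"Fragment text ...")   # -> (position, length)
#   finally:
#       snippets.close()
#
#   snippets = Snippets(book_id).open()            # read mode
#   try:
#       text = snippets.get(pos)
#   finally:
#       snippets.close()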
159
160 class BaseIndex(IndexStore):
161     """
162     Base index class.
163     Provides basic operations on index: opening, closing, optimizing.
164     """
165     def __init__(self, analyzer=None):
166         super(BaseIndex, self).__init__()
167         self.index = None
168         if not analyzer:
169             analyzer = WLAnalyzer()
170         self.analyzer = analyzer
171
172     def open(self, analyzer=None):
173         if self.index:
174             raise Exception("Index is already opened")
175         self.index = IndexWriter(self.store, self.analyzer,\
176                                  IndexWriter.MaxFieldLength.LIMITED)
177         return self.index
178
179     def optimize(self):
180         self.index.optimize()
181
182     def close(self):
183         try:
184             self.index.optimize()
185         except JavaError as je:
186             print "Error during optimize phase, check index: %s" % je
187
188         self.index.close()
189         self.index = None
190
191     def __enter__(self):
192         self.open()
193         return self
194
195     def __exit__(self, type, value, tb):
196         self.close()
197
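# Illustrative sketch: BaseIndex (and its subclasses) can be used as a context
# manager; the index is opened on entry and optimized/closed on exit. `book`
# below is an assumed catalogue Book instance; Index is defined further down.
#
#   with Index() as index:
#       index.index_tags()
#       index.index_book(book)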
198
199 class Index(BaseIndex):
200     """
201     Class indexing books.
202     """
203     def __init__(self, analyzer=None):
204         super(Index, self).__init__(analyzer)
205
206     def index_tags(self):
207         """
208         Re-index global tag list.
210         Removes all tags from the index, then indexes them again.
211         Indexed fields include: id, name (with and without Polish stems), category.
211         """
212         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
213         self.index.deleteDocuments(q)
214
215         for tag in catalogue.models.Tag.objects.all():
216             doc = Document()
217             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
218             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
220             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
221             self.index.addDocument(doc)
222
223         for pdtag in PDCounterAuthor.objects.all():
224             doc = Document()
225             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
226             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
227             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
228             doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
229             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
230             self.index.addDocument(doc)
231
232     def create_book_doc(self, book):
233         """
234         Create a Lucene document referring to the book id.
235         """
236         doc = Document()
237         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
238         if book.parent is not None:
239             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
240         return doc
241
242     def remove_book(self, book):
243         """Removes a book from search index.
244         book - Book instance."""
245         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
246         self.index.deleteDocuments(q)
247
248     def index_book(self, book, book_info=None, overwrite=True):
249         """
250         Indexes the book.
251         Creates a lucene document for extracted metadata
252         and calls self.index_content() to index the contents of the book.
253         """
254         if overwrite:
255             self.remove_book(book)
256
257         book_doc = self.create_book_doc(book)
258         meta_fields = self.extract_metadata(book, book_info)
259         for f in meta_fields.values():
260             if isinstance(f, list) or isinstance(f, tuple):
261                 for elem in f:
262                     book_doc.add(elem)
263             else:
264                 book_doc.add(f)
265
266         self.index.addDocument(book_doc)
267         del book_doc
268
269         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
270
271     master_tags = [
272         'opowiadanie',
273         'powiesc',
274         'dramat_wierszowany_l',
275         'dramat_wierszowany_lp',
276         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
277         'wywiad',
278         ]
279
280     ignore_content_tags = [
281         'uwaga', 'extra',
282         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
283         'didaskalia',
284         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
285         ]
286
287     footnote_tags = ['pa', 'pt', 'pr', 'pe']
288
289     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
290
291     published_date_re = re.compile("([0-9]+)[\]. ]*$")
292
293     def extract_metadata(self, book, book_info=None):
294         """
295         Extracts metadata from the book and returns a map of fields keyed by field name.
296         """
297         fields = {}
298
299         if book_info is None:
300             book_info = dcparser.parse(open(book.xml_file.path))
301
302         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
303         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
304         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
305
306         # validator, name
307         for field in dcparser.BookInfo.FIELDS:
308             if hasattr(book_info, field.name):
309                 if not getattr(book_info, field.name):
310                     continue
311                 # since no type information is available, we use validator
312                 type_indicator = field.validator
313                 if type_indicator == dcparser.as_unicode:
314                     s = getattr(book_info, field.name)
315                     if field.multiple:
316                         s = ', '.join(s)
317                     try:
318                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
319                     except JavaError as je:
320                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
321                 elif type_indicator == dcparser.as_person:
322                     p = getattr(book_info, field.name)
323                     if isinstance(p, dcparser.Person):
324                         persons = unicode(p)
325                     else:
326                         persons = ', '.join(map(unicode, p))
327                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
328                 elif type_indicator == dcparser.as_date:
329                     dt = getattr(book_info, field.name)
330                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
331                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
332
333         # get published date
334         pd = None
335         if hasattr(book_info, 'source_name') and book_info.source_name:
336             match = self.published_date_re.search(book_info.source_name)
337             if match is not None:
338                 pd = str(match.groups()[0])
339         if not pd: pd = ""
340         fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
341
342         return fields
343
344     def add_gaps(self, fields, fieldname):
345         """
346         Interposes a list of fields with gap fields (indexed spaces) and returns the result.
347         This allows phrase queries which do not cross the gaps (when slop is 0).
348         """
349         def gap():
350             while True:
351                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
352         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
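    # Illustrative note: for fields f1, f2, f3 the call above returns
    # (f1, gap, f2, gap, f3), where each gap is a NOT_ANALYZED single-space
    # field with the same name, so a phrase query with slop=0 cannot match
    # across two of the original values.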
353
354     def get_master(self, root):
355         """
356         Returns the first master tag from an etree.
357         """
358         for master in root.iter():
359             if master.tag in self.master_tags:
360                 return master
361
362     def index_content(self, book, book_fields=[]):
363         """
364         Walks the book XML and extracts content from it.
365         Adds parts for each header tag and for each fragment.
366         """
367         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
368         root = wld.edoc.getroot()
369
370         master = self.get_master(root)
371         if master is None:
372             return []
373
374         def walker(node, ignore_tags=[]):
375
376             if node.tag not in ignore_tags:
377                 yield node, None, None
378                 if node.text is not None:
379                     yield None, node.text, None
380                 for child in list(node):
381                     for b, t, e in walker(child):
382                         yield b, t, e
383                 yield None, None, node
384
385             if node.tail is not None:
386                 yield None, node.tail, None
387             return
388
389         def fix_format(text):
390             #            separator = [u" ", u"\t", u".", u";", u","]
391             if isinstance(text, list):
392                 # need to join it first
393                 text = filter(lambda s: s is not None, text)
394                 text = u' '.join(text)
395                 # for i in range(len(text)):
396                 #     if i > 0:
397                 #         if text[i][0] not in separator\
398                 #             and text[i - 1][-1] not in separator:
399                 #          text.insert(i, u" ")
400
401             return re.sub("(?m)/$", "", text)
402
403         def add_part(snippets, **fields):
404             doc = self.create_book_doc(book)
405             for f in book_fields:
406                 doc.add(f)
407
408             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
409             doc.add(NumericField("header_span", Field.Store.YES, True)\
410                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
411             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
412
413             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
414                           Field.TermVector.WITH_POSITIONS_OFFSETS))
415
416             snip_pos = snippets.add(fields["content"])
417             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
418             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
419
420             if 'fragment_anchor' in fields:
421                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
422                               Field.Store.YES, Field.Index.NOT_ANALYZED))
423
424             if 'themes' in fields:
425                 themes, themes_pl = zip(*[
426                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
427                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
428                      for theme in fields['themes']])
429
430                 themes = self.add_gaps(themes, 'themes')
431                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
432
433                 for t in themes:
434                     doc.add(t)
435                 for t in themes_pl:
436                     doc.add(t)
437
438             return doc
439
440         def give_me_utf8(s):
441             if isinstance(s, unicode):
442                 return s.encode('utf-8')
443             else:
444                 return s
445
446         fragments = {}
447         snippets = Snippets(book.id).open('w')
448         try:
449             for header, position in zip(list(master), range(len(master))):
450
451                 if header.tag in self.skip_header_tags:
452                     continue
453                 if header.tag is etree.Comment:
454                     continue
455
456                 # section content
457                 content = []
458                 footnote = []
459
460                 def all_content(text):
461                     for frag in fragments.values():
462                         frag['content'].append(text)
463                     content.append(text)
464                 handle_text = [all_content]
465
466
467                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
468                     # handle footnotes
469                     if start is not None and start.tag in self.footnote_tags:
470                         footnote = []
471                         def collect_footnote(t):
472                             footnote.append(t)
473                         handle_text.append(collect_footnote)
474                     elif end is not None and end.tag in self.footnote_tags:
475                         handle_text.pop()
476                         doc = add_part(snippets, header_index=position, header_type=header.tag,
477                                        content=u''.join(footnote),
478                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
479                 
480                         self.index.addDocument(doc)
481                         #print "@ footnote text: %s" % footnote
482                         footnote = []
483                     
484                     # handle fragments and themes.
485                     if start is not None and start.tag == 'begin':
486                         fid = start.attrib['id'][1:]
487                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
488
489                     # themes for this fragment
490                     elif start is not None and start.tag == 'motyw':
491                         fid = start.attrib['id'][1:]
492                         handle_text.append(None)
493                         if start.text is not None:
494                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
495                     elif end is not None and end.tag == 'motyw':
496                         handle_text.pop()
497
498                     elif start is not None and start.tag == 'end':
499                         fid = start.attrib['id'][1:]
500                         if fid not in fragments:
501                             continue  # a broken <end> node, skip it
502                         frag = fragments[fid]
503                         if frag['themes'] == []:
504                             continue  # empty themes list.
505                         del fragments[fid]
506
507                         doc = add_part(snippets,
508                                        header_type=frag['start_header'],
509                                        header_index=frag['start_section'],
510                                        header_span=position - frag['start_section'] + 1,
511                                        fragment_anchor=fid,
512                                        content=fix_format(frag['content']),
513                                        themes=frag['themes'])
514                         #print '@ FRAG %s' % frag['content']
515                         self.index.addDocument(doc)
516
517                         # Collect content.
518
519                     if text is not None and handle_text:
520                         hdl = handle_text[-1]
521                         if hdl is not None:
522                             hdl(text)
523
524                 # at the end, add the section text.
525                 doc = add_part(snippets, header_index=position, header_type=header.tag,
526                                content=fix_format(content))
527                 #print '@ CONTENT: %s' % fix_format(content)
528
529                 self.index.addDocument(doc)
530
531         finally:
532             snippets.close()
533
534
535 def log_exception_wrapper(f):
536     def _wrap(*a):
537         try:
538             f(*a)
539         except Exception as e:
540             print("Error in indexing thread: %s" % e)
541             traceback.print_exc()
542             raise
543     return _wrap
544
545
546 class ReusableIndex(Index):
547     """
548     Works like Index, but does not close/optimize the Lucene index
549     until program exit (uses an atexit hook).
550     This is useful for the importbooks command.
551
552     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
553     """
554     index = None
555
556     def open(self, analyzer=None, threads=4):
557         if ReusableIndex.index:
558             self.index = ReusableIndex.index
559         else:
560             print("opening index")
561             Index.open(self, analyzer)
562             ReusableIndex.index = self.index
563             atexit.register(ReusableIndex.close_reusable)
564
565     # def index_book(self, *args, **kw):
566     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
567     #     ReusableIndex.pool_jobs.append(job)
568
569     @staticmethod
570     def close_reusable():
571         if ReusableIndex.index:
572             print("closing index")
573             ReusableIndex.index.optimize()
574             ReusableIndex.index.close()
575             ReusableIndex.index = None
576
577     def close(self):
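        # Note: do not close the shared writer here; only commit pending
        # changes. The writer is optimized and closed in close_reusable(),
        # normally run at program exit via the atexit hook registered in open().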
578         if ReusableIndex.index:
579             ReusableIndex.index.commit()
580
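# Illustrative sketch: batch indexing with ReusableIndex. `books` is an assumed
# iterable of catalogue Book instances. The shared writer stays open between
# books - each close() only commits - and the final optimize/close happens via
# atexit (or an explicit close_reusable() call).
#
#   for book in books:
#       with ReusableIndex() as index:
#           index.index_book(book)
#   ReusableIndex.close_reusable()  # optional when the atexit hook can be relied on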
581
582 class JoinSearch(object):
583     """
584     This mixin could be used to handle block join queries.
585     (currently unused)
586     """
587     def __init__(self, *args, **kw):
588         super(JoinSearch, self).__init__(*args, **kw)
589
590     def wrapjoins(self, query, fields=[]):
591         """
592         This function modifies the query recursively,
593         so that contained Term and Phrase queries which match
594         the provided fields are wrapped in a BlockJoinQuery
595         and thus delegated to child documents.
596         """
597         if BooleanQuery.instance_(query):
598             qs = BooleanQuery.cast_(query)
599             for clause in qs:
600                 clause = BooleanClause.cast_(clause)
601                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
602             return qs
603         else:
604             termset = HashSet()
605             query.extractTerms(termset)
606             for t in termset:
607                 t = Term.cast_(t)
608                 if t.field() not in fields:
609                     return query
610             return BlockJoinQuery(query, self.parent_filter,
611                                   BlockJoinQuery.ScoreMode.Total)
612
613     def bsearch(self, query, max_results=50):
614         q = self.query(query)
615         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
616
617         tops = self.searcher.search(bjq, max_results)
618         bks = []
619         for found in tops.scoreDocs:
620             doc = self.searcher.doc(found.doc)
621             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
622         return (bks, tops.totalHits)
623
624
625 class SearchResult(object):
626     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
627         if tokens_cache is None: tokens_cache = {}
628
629         if score:
630             self._score = score
631         else:
632             self._score = scoreDocs.score
633
634         self.boost = 1.0
635
636         self._hits = []
637         self._processed_hits = None  # processed hits
638
639         stored = search.searcher.doc(scoreDocs.doc)
640         self.book_id = int(stored.get("book_id"))
641
642         pd = stored.get("published_date")
643         if pd is None:
644             pd = 0
645         self.published_date = int(pd)
646
647         header_type = stored.get("header_type")
648         # we have a content hit in some header or fragment
649         if header_type is not None:
650             sec = (header_type, int(stored.get("header_index")))
651             header_span = stored.get('header_span')
652             header_span = header_span is not None and int(header_span) or 1
653
654             fragment = stored.get("fragment_anchor")
655
656             if snippets:
657                 snippets = snippets.replace("/\n", "\n")
658             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
659
660             self._hits.append(hit)
661
662         self.search = search
663         self.searched = searched
664         self.tokens_cache = tokens_cache
665
666     @property
667     def score(self):
668         return self._score * self.boost
669
670     def merge(self, other):
671         if self.book_id != other.book_id:
672             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
673         self._hits += other._hits
674         if other.score > self.score:
675             self._score = other._score
676         return self
677
678     def get_book(self):
679         return catalogue.models.Book.objects.get(id=self.book_id)
680
681     book = property(get_book)
682
683     @property
684     def hits(self):
685         if self._processed_hits is not None:
686             return self._processed_hits
687
688         POSITION = 0
689         FRAGMENT = 1
690         POSITION_INDEX = 1
691         POSITION_SPAN = 2
692         SCORE = 2
693         OTHER = 3
694
695         # to sections and fragments
696         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
697         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
698         sect = filter(lambda s: 0 == len(filter(
699             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
700             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
701             frags)), sect)
702
703         hits = []
704
705         # remove duplicate fragments
706         fragments = {}
707         for f in frags:
708             fid = f[FRAGMENT]
709             if fid in fragments:
710                 if fragments[fid][SCORE] >= f[SCORE]:
711                     continue
712             fragments[fid] = f
713         frags = fragments.values()
714
715         # remove duplicate sections
716         sections = {}
717
718         for s in sect:
719             si = s[POSITION][POSITION_INDEX]
720             # skip existing
721             if si in sections:
722                 if sections[si]['score'] >= s[SCORE]:
723                     continue
724
725             m = {'score': s[SCORE],
726                  'section_number': s[POSITION][POSITION_INDEX] + 1,
727                  }
728             m.update(s[OTHER])
729             sections[si] = m
730
731         hits = sections.values()
732
733         for f in frags:
734             try:
735                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
736             except catalogue.models.Fragment.DoesNotExist:
737                 # stale index
738                 continue
739
740             # Figure out if we were searching for a token matching some word in theme name.
741             themes = frag.tags.filter(category='theme')
742             themes_hit = []
743             if self.searched is not None:
744                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
745                 for theme in themes:
746                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
747                     for t in tokens:
748                         if t in name_tokens:
749                             if not theme in themes_hit:
750                                 themes_hit.append(theme)
751                             break
752
753             m = {'score': f[SCORE],
754                  'fragment': frag,
755                  'section_number': f[POSITION][POSITION_INDEX] + 1,
756                  'themes': themes,
757                  'themes_hit': themes_hit
758                  }
759             m.update(f[OTHER])
760             hits.append(m)
761
762         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
763
764         self._processed_hits = hits
765
766         return hits
767
768     def __unicode__(self):
769         return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)
770
771     @staticmethod
772     def aggregate(*result_lists):
773         books = {}
774         for rl in result_lists:
775             for r in rl:
776                 if r.book_id in books:
777                     books[r.book_id].merge(r)
778                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
779                 else:
780                     books[r.book_id] = r
781         return books.values()
782
783     def __cmp__(self, other):
784         c = cmp(self.score, other.score)
785         if c == 0:
786             # this is inverted, because earlier date is better
787             return cmp(other.published_date, self.published_date)
788         else:
789             return c
790
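# Illustrative sketch: merging results coming from several search strategies.
# `search` is an assumed Search instance and `query` an assumed query string.
# aggregate() merges hits for the same book; __cmp__ above sorts by score,
# breaking ties in favour of the earlier publication date.
#
#   results = SearchResult.aggregate(
#       search.search_perfect_book(query),
#       search.search_everywhere(query))
#   results.sort(reverse=True)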
791
792 class Hint(object):
793     """
794     Given some hint information (things we already know) about
795     our search target - like author, title (a specific book), epoch, genre, kind -
796     we can narrow down the search using filters.
797     """
798     def __init__(self, search):
799         """
800         Accepts a Searcher instance.
801         """
802         self.search = search
803         self.book_tags = {}
804         self.part_tags = []
805         self._books = []
806
807     def books(self, *books):
808         """
809         Give a hint that we search these books.
810         """
811         self._books = books
812
813     def tags(self, tags):
814         """
815         Give a hint that these Tag objects (a list)
816         are necessary.
817         """
818         for t in tags:
819             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
820                 lst = self.book_tags.get(t.category, [])
821                 lst.append(t)
822                 self.book_tags[t.category] = lst
823             if t.category in ['theme', 'theme_pl']:
824                 self.part_tags.append(t)
825
826     def tag_filter(self, tags, field='tags'):
827         """
828         Given a list of tags and an optional field (they are normally in the 'tags' field),
829         returns a filter accepting only books with the specified tags.
830         """
831         q = BooleanQuery()
832
833         for tag in tags:
834             toks = self.search.get_tokens(tag.name, field=field)
835             tag_phrase = PhraseQuery()
836             for tok in toks:
837                 tag_phrase.add(Term(field, tok))
838             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
839
840         return QueryWrapperFilter(q)
841
842     def book_filter(self):
843         """
844         Filters using book tags (all tag kinds except a theme)
845         """
846         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
847         if tags:
848             return self.tag_filter(tags)
849         else:
850             return None
851
852     def part_filter(self):
853         """
854         This filter can be used to look for book parts.
855         It filters on book id and/or themes.
856         """
857         fs = []
858         if self.part_tags:
859             fs.append(self.tag_filter(self.part_tags, field='themes'))
860
861         if self._books != []:
862             bf = BooleanFilter()
863             for b in self._books:
864                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
865                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
866             fs.append(bf)
867
868         return Search.chain_filters(fs)
869
870     def should_search_for_book(self):
871         return self._books == []
872
873     def just_search_in(self, all):
874         """Holds logic to figure out which indexes should be searched, when we already have some hints."""
875         some = []
876         for field in all:
877             if field == 'authors' and 'author' in self.book_tags:
878                 continue
879             if field == 'title' and self._books != []:
880                 continue
881             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
882                 continue
883             some.append(field)
884         return some
885
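# Illustrative sketch: narrowing a search with a Hint. `search` is an assumed
# Search instance and `author_tag` an assumed Tag with category 'author'.
#
#   hint = search.hint()
#   hint.tags([author_tag])
#   results = search.search_perfect_book(u"pan tadeusz", hint=hint)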
886
887 class Search(IndexStore):
888     """
889     Search facilities.
890     """
891     def __init__(self, default_field="content"):
892         IndexStore.__init__(self)
893         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
894         # self.analyzer = WLAnalyzer()
895         self.searcher = IndexSearcher(self.store, True)
896         self.parser = QueryParser(Version.LUCENE_34, default_field,
897                                   self.analyzer)
898
899         self.parent_filter = TermsFilter()
900         self.parent_filter.addTerm(Term("is_book", "true"))
901
902     def query(self, query):
903         """Parse a query in the default Lucene syntax (for humans).
904         """
905         return self.parser.parse(query)
906
907     def simple_search(self, query, max_results=50):
908         """Runs a query for books using Lucene syntax (for humans).
909         Returns (books, total_hits).
910         """
911
912         tops = self.searcher.search(self.query(query), max_results)
913         bks = []
914         for found in tops.scoreDocs:
915             doc = self.searcher.doc(found.doc)
916             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
917         return (bks, tops.totalHits)
918
919     def get_tokens(self, searched, field='content', cached=None):
920         """Returns tokens analyzed by the analyzer appropriate for the given field.
921         The argument can be a StringReader, a string/unicode, or a list of tokens. In the last case
922         the tokens are simply returned (so we can reuse them, as long as the analyzer does not change).
923         """
924         if cached is not None and field in cached:
925             return cached[field]
926
927         if isinstance(searched, str) or isinstance(searched, unicode):
928             searched = StringReader(searched)
929         elif isinstance(searched, list):
930             return searched
931
932         searched.reset()
933         tokens = self.analyzer.reusableTokenStream(field, searched)
934         toks = []
935         while tokens.incrementToken():
936             cta = tokens.getAttribute(CharTermAttribute.class_)
937             toks.append(cta.toString())
938
939         if cached is not None:
940             cached[field] = toks
941
942         return toks
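    # Illustrative sketch: reusing analyzed tokens across several queries by
    # passing the same cache dictionary (keyed by field name). `search` is an
    # assumed Search instance.
    #
    #   cache = {}
    #   toks = search.get_tokens(u"szukana fraza", 'content', cached=cache)
    #   toks = search.get_tokens(u"szukana fraza", 'content', cached=cache)  # served from the cache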
943
944     def fuzziness(self, fuzzy):
945         """Helper method to sanitize fuzziness"""
946         if not fuzzy:
947             return None
948         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
949             return fuzzy
950         else:
951             return 0.5
952
953     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
954         """
955         Return a PhraseQuery with a series of tokens.
956         """
957         if fuzzy:
958             phrase = MultiPhraseQuery()
959             for t in tokens:
960                 term = Term(field, t)
961                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
962                 fuzzterms = []
963
964                 while True:
965                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
966                     ft = fuzzterm.term()
967                     if ft:
968                         fuzzterms.append(ft)
969                     if not fuzzterm.next(): break
970                 if fuzzterms:
971                     phrase.add(JArray('object')(fuzzterms, Term))
972                 else:
973                     phrase.add(term)
974         else:
975             phrase = PhraseQuery()
976             phrase.setSlop(slop)
977             for t in tokens:
978                 term = Term(field, t)
979                 phrase.add(term)
980         return phrase
981
982     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
983         """
984         Returns term queries joined by a boolean query.
985         modal - applies to the boolean query
986         fuzzy - should the query be fuzzy.
987         """
988         q = BooleanQuery()
989         for t in tokens:
990             term = Term(field, t)
991             if fuzzy:
992                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
993             else:
994                 term = TermQuery(term)
995             q.add(BooleanClause(term, modal))
996         return q
997
998     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
999                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1000         if filters is None: filters = []
1001         if tokens_cache is None: tokens_cache = {}
1002
1003         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1004
1005         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1006         if book:
1007             filters.append(self.term_filter(Term('is_book', 'true')))
1008         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1009
1010         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1011
1012     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1013                     filters=None, tokens_cache=None, boost=None, snippets=True):
1014         if filters is None: filters = []
1015         if tokens_cache is None: tokens_cache = {}
1016
1017         if book:
1018             filters.append(self.term_filter(Term('is_book', 'true')))
1019
1020         query = BooleanQuery()
1021
1022         for fld in fields:
1023             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1024
1025             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1026                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1027
1028         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1029
1030         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1031                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1032
1033     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1034         """
1035         Search for perfect book matches. Just see if the query matches some author or title,
1036         taking hints into account.
1037         """
1038         fields_to_search = ['authors', 'title']
1039         only_in = None
1040         if hint:
1041             if not hint.should_search_for_book():
1042                 return []
1043             fields_to_search = hint.just_search_in(fields_to_search)
1044             only_in = hint.book_filter()
1045
1046         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1047
1048         books = []
1049         for q in qrys:
1050             top = self.searcher.search(q,
1051                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1052                 max_results)
1053             for found in top.scoreDocs:
1054                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1055         return books
1056
1057     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1058         fields_to_search = ['tags', 'authors', 'title']
1059
1060         only_in = None
1061         if hint:
1062             if not hint.should_search_for_book():
1063                 return []
1064             fields_to_search = hint.just_search_in(fields_to_search)
1065             only_in = hint.book_filter()
1066
1067         tokens = self.get_tokens(searched, field='SIMPLE')
1068
1069         q = BooleanQuery()
1070
1071         for fld in fields_to_search:
1072             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1073                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1074
1075         books = []
1076         top = self.searcher.search(q,
1077                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1078             max_results)
1079         for found in top.scoreDocs:
1080             books.append(SearchResult(self, found, how_found="search_book"))
1081
1082         return books
1083
1084     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1085         """
1086         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1087         some part/fragment of the book.
1088         """
1089         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1090
1091         flt = None
1092         if hint:
1093             flt = hint.part_filter()
1094
1095         books = []
1096         for q in qrys:
1097             top = self.searcher.search(q,
1098                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1099                                                            flt]),
1100                                        max_results)
1101             for found in top.scoreDocs:
1102                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1103
1104         return books
1105
1106     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1107         """
1108         Tries to use search terms to match different fields of the book (or its parts).
1109         E.g. one word can be an author's surname, another a part of the title, and the rest
1110         some words from the third chapter.
1111         """
1112         if tokens_cache is None: tokens_cache = {}
1113         books = []
1114         only_in = None
1115
1116         if hint:
1117             only_in = hint.part_filter()
1118
1119         # content only query : themes x content
1120         q = BooleanQuery()
1121
1122         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1123         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1124
1125         # only search in themes when we do not already filter by themes
1126         if hint is None or hint.just_search_in(['themes']) != []:
1127             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1128                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1129
1130         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1131                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1132
1133         topDocs = self.searcher.search(q, only_in, max_results)
1134         for found in topDocs.scoreDocs:
1135             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1136             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1137
1138         # query themes/content x author/title/tags
1139         q = BooleanQuery()
1140         in_content = BooleanQuery()
1141         in_meta = BooleanQuery()
1142
1143         for fld in ['themes_pl', 'content']:
1144             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1145
1146         for fld in ['tags', 'authors', 'title']:
1147             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1148
1149         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1150         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1151
1152         topDocs = self.searcher.search(q, only_in, max_results)
1153         for found in topDocs.scoreDocs:
1154             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1155             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1156
1157         return books
1158
1159     # def multisearch(self, query, max_results=50):
1160     #     """
1161     #     Search strategy:
1162     #     - (phrase) OR -> content
1163     #                   -> title
1164     #                   -> authors
1165     #     - (keywords)  -> authors
1166     #                   -> motyw
1167     #                   -> tags
1168     #                   -> content
1169     #     """
1170         # queryreader = StringReader(query)
1171         # tokens = self.get_tokens(queryreader)
1172
1173         # top_level = BooleanQuery()
1174         # Should = BooleanClause.Occur.SHOULD
1175
1176         # phrase_level = BooleanQuery()
1177         # phrase_level.setBoost(1.3)
1178
1179         # p_content = self.make_phrase(tokens, joined=True)
1180         # p_title = self.make_phrase(tokens, 'title')
1181         # p_author = self.make_phrase(tokens, 'author')
1182
1183         # phrase_level.add(BooleanClause(p_content, Should))
1184         # phrase_level.add(BooleanClause(p_title, Should))
1185         # phrase_level.add(BooleanClause(p_author, Should))
1186
1187         # kw_level = BooleanQuery()
1188
1189         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1190         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1191         # kw_level.add(j_themes, Should)
1192         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1193         # j_con = self.make_term_query(tokens, joined=True)
1194         # kw_level.add(j_con, Should)
1195
1196         # top_level.add(BooleanClause(phrase_level, Should))
1197         # top_level.add(BooleanClause(kw_level, Should))
1198
1199         # return None
1200
1201     def get_snippets(self, scoreDoc, query, field='content'):
1202         """
1203         Returns a snippet for found scoreDoc.
1204         """
1205         htmlFormatter = SimpleHTMLFormatter()
1206         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1207
1208         stored = self.searcher.doc(scoreDoc.doc)
1209
1210         position = stored.get('snippets_position')
1211         length = stored.get('snippets_length')
1212         if position is None or length is None:
1213             return None
1214         # locate content.
1215         snippets = Snippets(stored.get('book_id')).open()
1216         try:
1217             text = snippets.get((int(position),
1218                                  int(length)))
1219         finally:
1220             snippets.close()
1221
1222         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1223         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1224         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1225
1226         return snip
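    # Illustrative sketch: building highlighted snippets for hits returned by
    # the searcher. `top` is an assumed TopDocs result for `query`.
    #
    #   for found in top.scoreDocs:
    #       snippet = self.get_snippets(found, query)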
1227
1228     @staticmethod
1229     def enum_to_array(enum):
1230         """
1231         Converts a Lucene TermEnum to an array of Terms, suitable for
1232         addition to queries.
1233         """
1234         terms = []
1235
1236         while True:
1237             t = enum.term()
1238             if t:
1239                 terms.append(t)
1240             if not enum.next(): break
1241
1242         if terms:
1243             return JArray('object')(terms, Term)
1244
1245     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1246         """
1247         Search for Tag objects using query.
1248         """
1249         if not pdcounter:
1250             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1251         tops = self.searcher.search(query, filters, max_results)
1252
1253         tags = []
1254         for found in tops.scoreDocs:
1255             doc = self.searcher.doc(found.doc)
1256             is_pdcounter = doc.get('is_pdcounter')
1257             if is_pdcounter:
1258                 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1259             else:
1260                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1261                 # don't add the pdcounter tag if same tag already exists
1262             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1263                 tags.append(tag)
1264                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1265         print 'returning %s' % tags
1266         return tags
1267
1268     def search_books(self, query, filter=None, max_results=10):
1269         """
1270         Searches for Book objects using query
1271         """
1272         bks = []
1273         tops = self.searcher.search(query, filter, max_results)
1274         for found in tops.scoreDocs:
1275             doc = self.searcher.doc(found.doc)
1276             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1277         return bks
1278
1279     def make_prefix_phrase(self, toks, field):
1280         q = MultiPhraseQuery()
1281         for i in range(len(toks)):
1282             t = Term(field, toks[i])
1283             if i == len(toks) - 1:
1284                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1285                 if pterms:
1286                     q.add(pterms)
1287                 else:
1288                     q.add(t)
1289             else:
1290                 q.add(t)
1291         return q
1292
1293     @staticmethod
1294     def term_filter(term, inverse=False):
1295         only_term = TermsFilter()
1296         only_term.addTerm(term)
1297
1298         if inverse:
1299             neg = BooleanFilter()
1300             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1301             only_term = neg
1302
1303         return only_term
1304
1305     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
1306         """
1307         Return auto-complete hints for tags
1308         using prefix search.
1309         """
1310         toks = self.get_tokens(string, field='SIMPLE')
1311         top = BooleanQuery()
1312
1313         for field in ['tag_name', 'tag_name_pl']:
1314             if prefix:
1315                 q = self.make_prefix_phrase(toks, field)
1316             else:
1317                 q = self.make_term_query(toks, field)
1318             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1319
1320         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1321
1322         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1323
1324     def hint_books(self, string, max_results=50, prefix=True):
1325         """
1326         Returns auto-complete hints for book titles
1327         (because we do not index 'pseudo' title tags).
1328         Uses prefix search.
1329         """
1330         toks = self.get_tokens(string, field='SIMPLE')
1331
1332         if prefix:
1333             q = self.make_prefix_phrase(toks, 'title')
1334         else:
1335             q = self.make_term_query(toks, 'title')
1336
1337         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1338
1339     @staticmethod
1340     def chain_filters(filters, op=ChainedFilter.AND):
1341         """
1342         Chains a filter list together
1343         """
1344         filters = filter(lambda x: x is not None, filters)
1345         if not filters:
1346             return None
1347         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1348         return chf
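    # Illustrative sketch: combining filters. None entries are dropped, so
    # optional filters can be passed in directly (as in search_perfect_book above).
    #
    #   flt = self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))])
    #   top = self.searcher.search(query, flt, max_results)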
1349
1350     def filtered_categories(self, tags):
1351         """
1352         Return a list of tag categories present in the tags list.
1353         """
1354         cats = {}
1355         for t in tags:
1356             cats[t.category] = True
1357         return cats.keys()
1358
1359     def hint(self):
1360         return Hint(self)