1 # -*- coding: utf-8 -*-
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5 File, Field, Integer, \
6 NumericField, Version, Document, JavaError, IndexSearcher, \
7 QueryParser, PerFieldAnalyzerWrapper, \
8 SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9 KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10 BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11 HashSet, BooleanClause, Term, CharTermAttribute, \
12 PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13 FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14 SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15 BooleanFilter, FilterClause, QueryWrapperFilter, \
16 initVM, CLASSPATH, JArray
20 JVM = initVM(CLASSPATH)
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
39 polish = PolishAnalyzer(Version.LUCENE_34)
40 # polish_gap.setPositionIncrementGap(999)
42 simple = SimpleAnalyzer(Version.LUCENE_34)
43 # simple_gap.setPositionIncrementGap(999)
45 keyword = KeywordAnalyzer(Version.LUCENE_34)
47 # not sure if needed: there's NOT_ANALYZED meaning basically the same
49 PerFieldAnalyzerWrapper.__init__(self, polish)
51 self.addAnalyzer("tags", simple)
52 self.addAnalyzer("technical_editors", simple)
53 self.addAnalyzer("editors", simple)
54 self.addAnalyzer("url", keyword)
55 self.addAnalyzer("source_url", keyword)
56 self.addAnalyzer("source_name", simple)
57 self.addAnalyzer("publisher", simple)
58 self.addAnalyzer("authors", simple)
59 self.addAnalyzer("title", simple)
61 self.addAnalyzer("is_book", keyword)
62 # shouldn't the title have two forms? _pl and simple?
64 self.addAnalyzer("themes", simple)
65 self.addAnalyzer("themes_pl", polish)
67 self.addAnalyzer("tag_name", simple)
68 self.addAnalyzer("tag_name_pl", polish)
70 self.addAnalyzer("translators", simple)
72 self.addAnalyzer("KEYWORD", keyword)
73 self.addAnalyzer("SIMPLE", simple)
74 self.addAnalyzer("POLISH", polish)
77 class IndexStore(object):
79 Provides access to search index.
81 self.store - lucene index directory
85 self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
87 def make_index_dir(self):
89 os.makedirs(settings.SEARCH_INDEX)
90 except OSError as exc:
91 if exc.errno == errno.EEXIST:
96 class IndexChecker(IndexStore):
98 IndexStore.__init__(self)
101 checker = CheckIndex(self.store)
102 status = checker.checkIndex()
106 class Snippets(object):
108 This class manages snippet files for an indexed object (book).
109 The snippets are concatenated together; their positions and
110 lengths are kept in lucene index fields.
112 SNIPPET_DIR = "snippets"
114 def __init__(self, book_id):
116 os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117 except OSError as exc:
118 if exc.errno == errno.EEXIST:
121 self.book_id = book_id
124 def open(self, mode='r'):
126 Open the snippet file. Call .close() afterwards.
130 self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
134 def add(self, snippet):
136 Append a snippet (unicode) to the snippet file.
137 Return a (position, length) tuple
139 txt = snippet.encode('utf-8')
142 pos = (self.position, l)
148 Given a (position, length) tuple, return the unicode
149 snippet stored there.
151 self.file.seek(pos[0], 0)
152 txt = self.file.read(pos[1]).decode('utf-8')
156 """Close snippet file"""
160 class BaseIndex(IndexStore):
163 Provides basic operations on index: opening, closing, optimizing.
165 def __init__(self, analyzer=None):
166 super(BaseIndex, self).__init__()
169 analyzer = WLAnalyzer()
170 self.analyzer = analyzer
172 def open(self, analyzer=None):
174 raise Exception("Index is already opened")
175 self.index = IndexWriter(self.store, self.analyzer,\
176 IndexWriter.MaxFieldLength.LIMITED)
180 self.index.optimize()
184 self.index.optimize()
185 except JavaError as je:
186 print("Error during optimize phase, check index: %s" % je)
195 def __exit__(self, type, value, tb):
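# BaseIndex is meant to work as a context manager (__exit__ above closes the
# index; __enter__, not shown here, presumably opens it). A rough sketch,
# assuming `book` is a catalogue.models.Book:
#
#   with Index() as index:
#       index.index_book(book)
#   # the writer is optimized and closed on exit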
199 class Index(BaseIndex):
201 Class indexing books.
203 def __init__(self, analyzer=None):
204 super(Index, self).__init__(analyzer)
206 def index_tags(self):
208 Re-index global tag list.
209 Removes all tags from the index, then indexes them again.
210 Indexed fields include: id, name (with and without polish stems), category
212 q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
213 self.index.deleteDocuments(q)
215 for tag in catalogue.models.Tag.objects.all():
217 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
218 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
220 doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
221 self.index.addDocument(doc)
223 for pdtag in PDCounterAuthor.objects.all():
225 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
226 doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
227 doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
228 doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
229 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
230 self.index.addDocument(doc)
232 def create_book_doc(self, book):
234 Create a lucene document referring book id.
237 doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
238 if book.parent is not None:
239 doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
242 def remove_book(self, book):
243 """Removes a book from search index.
244 book - Book instance."""
245 q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
246 self.index.deleteDocuments(q)
248 def index_book(self, book, book_info=None, overwrite=True):
251 Creates a lucene document for extracted metadata
252 and calls self.index_content() to index the contents of the book.
255 self.remove_book(book)
257 book_doc = self.create_book_doc(book)
258 meta_fields = self.extract_metadata(book, book_info)
259 for f in meta_fields.values():
260 if isinstance(f, list) or isinstance(f, tuple):
266 self.index.addDocument(book_doc)
269 self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
274 'dramat_wierszowany_l',
275 'dramat_wierszowany_lp',
276 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
280 ignore_content_tags = [
282 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
284 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
287 footnote_tags = ['pa', 'pt', 'pr', 'pe']
289 skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
291 published_date_re = re.compile("([0-9]+)[\]. ]*$")
293 def extract_metadata(self, book, book_info=None):
295 Extracts metadata from the book and returns a map of fields keyed by field name.
299 if book_info is None:
300 book_info = dcparser.parse(open(book.xml_file.path))
302 fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
303 fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
304 fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
307 for field in dcparser.BookInfo.FIELDS:
308 if hasattr(book_info, field.name):
309 if not getattr(book_info, field.name):
311 # since no type information is available, we use validator
312 type_indicator = field.validator
313 if type_indicator == dcparser.as_unicode:
314 s = getattr(book_info, field.name)
318 fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
319 except JavaError as je:
320 raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
321 elif type_indicator == dcparser.as_person:
322 p = getattr(book_info, field.name)
323 if isinstance(p, dcparser.Person):
326 persons = ', '.join(map(unicode, p))
327 fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
328 elif type_indicator == dcparser.as_date:
329 dt = getattr(book_info, field.name)
330 fields[field.name] = Field(field.name, "%04d%02d%02d" %\
331 (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
334 source = book_info.source_name
335 match = self.published_date_re.search(source)
336 if match is not None:
337 fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
341 def add_gaps(self, fields, fieldname):
343 Interposes the given list of fields with gap fields (indexed single spaces) and returns the result.
344 This allows phrase queries that do not cross the gaps (when slop is 0).
348 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
349 return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
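# For example (sketch): add_gaps([f1, f2, f3], 'tags') yields
# [f1, gap, f2, gap, f3], where each gap is a NOT_ANALYZED single-space field,
# so a slop-0 phrase query cannot match across two adjacent tag values.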
351 def get_master(self, root):
353 Returns the first master tag from an etree.
355 for master in root.iter():
356 if master.tag in self.master_tags:
359 def index_content(self, book, book_fields=[]):
361 Walks the book XML and extracts content from it.
362 Adds parts for each header tag and for each fragment.
364 wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
365 root = wld.edoc.getroot()
367 master = self.get_master(root)
371 def walker(node, ignore_tags=[]):
373 for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
374 for b, e in walker(child):
379 def fix_format(text):
380 # separator = [u" ", u"\t", u".", u";", u","]
381 if isinstance(text, list):
382 # need to join it first
383 text = filter(lambda s: s is not None, text)  # filter the passed-in list, not the enclosing `content`
384 text = u' '.join(text)
385 # for i in range(len(text)):
387 # if text[i][0] not in separator\
388 # and text[i - 1][-1] not in separator:
389 # text.insert(i, u" ")
391 return re.sub("(?m)/$", "", text)
393 def add_part(snippets, **fields):
394 doc = self.create_book_doc(book)
395 for f in book_fields:
398 doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
399 doc.add(NumericField("header_span", Field.Store.YES, True)\
400 .setIntValue('header_span' in fields and fields['header_span'] or 1))
401 doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
403 doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
404 Field.TermVector.WITH_POSITIONS_OFFSETS))
406 snip_pos = snippets.add(fields["content"])
407 doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
408 doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
410 if 'fragment_anchor' in fields:
411 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
412 Field.Store.YES, Field.Index.NOT_ANALYZED))
414 if 'themes' in fields:
415 themes, themes_pl = zip(*[
416 (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
417 Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
418 for theme in fields['themes']])
420 themes = self.add_gaps(themes, 'themes')
421 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
431 if isinstance(s, unicode):
432 return s.encode('utf-8')
437 snippets = Snippets(book.id).open('w')
439 for header, position in zip(list(master), range(len(master))):
441 if header.tag in self.skip_header_tags:
443 if header.tag is etree.Comment:
450 for start, end in walker(header, ignore_tags=self.ignore_content_tags):
452 # if start is not None and start.tag in self.footnote_tags:
453 # footnote = ' '.join(start.itertext())
454 # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
455 # doc = add_part(snippets, header_index=position, header_type=header.tag,
458 # self.index.addDocument(doc)
462 # handle fragments and themes.
463 if start is not None and start.tag == 'begin':
464 fid = start.attrib['id'][1:]
465 fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
467 elif start is not None and start.tag == 'motyw':
468 fid = start.attrib['id'][1:]
469 if start.text is not None:
470 fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
472 elif start is not None and start.tag == 'end':
473 fid = start.attrib['id'][1:]
474 if fid not in fragments:
475 continue # a broken <end> node, skip it
476 # import pdb; pdb.set_trace()
477 frag = fragments[fid]
478 if frag['themes'] == []:
479 continue # empty themes list.
482 doc = add_part(snippets,
483 header_type=frag['start_header'],
484 header_index=frag['start_section'],
485 header_span=position - frag['start_section'] + 1,
487 content=fix_format(frag['content']),
488 themes=frag['themes'])
490 self.index.addDocument(doc)
493 elif start is not None:
494 for frag in fragments.values():
495 frag['content'].append(start.text)
496 content.append(start.text)
497 elif end is not None:
498 for frag in fragments.values():
499 frag['content'].append(end.tail)
500 content.append(end.tail)
502 # in the end, add a section text.
503 doc = add_part(snippets, header_index=position, header_type=header.tag,
504 content=fix_format(content))
506 self.index.addDocument(doc)
512 def log_exception_wrapper(f):
517 print("Error in indexing thread: %s" % e)
518 traceback.print_exc()
523 class ReusableIndex(Index):
525 Works like Index, but does not close/optimize the Lucene index
526 until program exit (uses an atexit hook).
527 This is useful for the importbooks command.
529 If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
533 def open(self, analyzer=None, threads=4):
534 if ReusableIndex.index is not None:
535 self.index = ReusableIndex.index
537 print("opening index")
538 Index.open(self, analyzer)
539 ReusableIndex.index = self.index
540 atexit.register(ReusableIndex.close_reusable)
542 # def index_book(self, *args, **kw):
543 # job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
544 # ReusableIndex.pool_jobs.append(job)
547 def close_reusable():
548 if ReusableIndex.index is not None:
549 ReusableIndex.index.optimize()
550 ReusableIndex.index.close()
551 ReusableIndex.index = None
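# Rough usage sketch (e.g. from the importbooks management command; `book` is
# an assumed catalogue.models.Book instance):
#
#   idx = ReusableIndex()
#   idx.open()            # reuses the shared writer if one is already open
#   idx.index_book(book)
#   # the shared writer is optimized and closed at interpreter exit (atexit),
#   # or explicitly via ReusableIndex.close_reusable()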
557 class JoinSearch(object):
559 This mixin could be used to handle block join queries.
562 def __init__(self, *args, **kw):
563 super(JoinSearch, self).__init__(*args, **kw)
565 def wrapjoins(self, query, fields=[]):
567 This function modifies the query recursively, so that contained
568 Term and Phrase queries which match the provided fields
569 are wrapped in a BlockJoinQuery,
570 and thus delegated to child documents.
572 if BooleanQuery.instance_(query):
573 qs = BooleanQuery.cast_(query)
575 clause = BooleanClause.cast_(clause)
576 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
580 query.extractTerms(termset)
583 if t.field() not in fields:
585 return BlockJoinQuery(query, self.parent_filter,
586 BlockJoinQuery.ScoreMode.Total)
588 def bsearch(self, query, max_results=50):
589 q = self.query(query)
590 bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
592 tops = self.searcher.search(bjq, max_results)
594 for found in tops.scoreDocs:
595 doc = self.searcher.doc(found.doc)
596 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
597 return (bks, tops.totalHits)
600 class SearchResult(object):
601 def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
602 if tokens_cache is None: tokens_cache = {}
607 self._score = scoreDocs.score
612 self._processed_hits = None # processed hits
614 stored = search.searcher.doc(scoreDocs.doc)
615 self.book_id = int(stored.get("book_id"))
617 pd = stored.get("published_date")
620 self.published_date = int(pd)
622 header_type = stored.get("header_type")
623 # we have a content hit in some header or fragment
624 if header_type is not None:
625 sec = (header_type, int(stored.get("header_index")))
626 header_span = stored.get('header_span')
627 header_span = header_span is not None and int(header_span) or 1
629 fragment = stored.get("fragment_anchor")
632 snippets = snippets.replace("/\n", "\n")
633 hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
635 self._hits.append(hit)
638 self.searched = searched
639 self.tokens_cache = tokens_cache
643 return self._score * self.boost
645 def merge(self, other):
646 if self.book_id != other.book_id:
647 raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
648 self._hits += other._hits
649 if other.score > self.score:
650 self._score = other._score
654 return catalogue.models.Book.objects.get(id=self.book_id)
656 book = property(get_book)
660 if self._processed_hits is not None:
661 return self._processed_hits
670 # to sections and fragments
671 frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
672 sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
673 sect = filter(lambda s: 0 == len(filter(
674 lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
675 and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
680 # remove duplicate fragments
685 if fragments[fid][SCORE] >= f[SCORE]:
688 frags = fragments.values()
690 # remove duplicate sections
694 si = s[POSITION][POSITION_INDEX]
697 if sections[si]['score'] >= s[SCORE]:
700 m = {'score': s[SCORE],
701 'section_number': s[POSITION][POSITION_INDEX] + 1,
706 hits = sections.values()
710 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
711 except catalogue.models.Fragment.DoesNotExist:
715 # Figure out if we were searching for a token matching some word in theme name.
716 themes = frag.tags.filter(category='theme')
718 if self.searched is not None:
719 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
721 name_tokens = self.search.get_tokens(theme.name, 'POLISH')
724 if not theme in themes_hit:
725 themes_hit.append(theme)
728 m = {'score': f[SCORE],
730 'section_number': f[POSITION][POSITION_INDEX] + 1,
732 'themes_hit': themes_hit
737 hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
739 self._processed_hits = hits
743 def __unicode__(self):
744 return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)
747 def aggregate(*result_lists):
749 for rl in result_lists:
751 if r.book_id in books:
752 books[r.book_id].merge(r)
753 #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
756 return books.values()
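# Sketch of a typical aggregation (call sites assumed, not shown here): several
# search passes may return SearchResults for the same book; these are merged
# per book_id and then sorted via __cmp__ below (score, then earlier date):
#
#   results = SearchResult.aggregate(
#       search.search_perfect_book(query),
#       search.search_everywhere(query))
#   results.sort(reverse=True)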
758 def __cmp__(self, other):
759 c = cmp(self.score, other.score)
761 if not hasattr(other,'published_date') or not hasattr(self, 'published_date'):
762 import pdb; pdb.set_trace()
763 # this is inverted, because earlier date is better
764 return cmp(other.published_date, self.published_date)
771 Given some hint information (things we already know about
772 our search target - like author, title (specific book), epoch, genre, kind)
773 we can narrow down the search using filters.
775 def __init__(self, search):
777 Accepts a Searcher instance.
784 def books(self, *books):
786 Give a hint that we search these books.
790 def tags(self, tags):
792 Give a hint that these Tag objects (a list of)
796 if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
797 lst = self.book_tags.get(t.category, [])
799 self.book_tags[t.category] = lst
800 if t.category in ['theme', 'theme_pl']:
801 self.part_tags.append(t)
803 def tag_filter(self, tags, field='tags'):
805 Given a list of tags and an optional field (they are normally in the 'tags' field),
806 returns a filter accepting only books with those specific tags.
811 toks = self.search.get_tokens(tag.name, field=field)
812 tag_phrase = PhraseQuery()
814 tag_phrase.add(Term(field, tok))
815 q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
817 return QueryWrapperFilter(q)
819 def book_filter(self):
821 Filters using book tags (all tag categories except themes)
823 tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
825 return self.tag_filter(tags)
829 def part_filter(self):
831 This filter can be used to look for book parts.
832 It filters on book id and/or themes.
836 fs.append(self.tag_filter(self.part_tags, field='themes'))
838 if self._books != []:
840 for b in self._books:
841 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
842 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
845 return Search.chain_filters(fs)
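# A rough sketch of using Hint (assuming `search` is a Search instance and
# `tag` is a catalogue.models.Tag):
#
#   hint = Hint(search)
#   hint.tags([tag])            # author/title/epoch/genre/kind narrow the book set,
#                               # theme/theme_pl narrow fragments
#   flt = hint.book_filter()    # filter for whole-book queries
#   # hint.part_filter() would filter content/fragment queries instead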
847 def should_search_for_book(self):
848 return self._books == []
850 def just_search_in(self, all):
851 """Holds logic to figure out which indexes should be search, when we have some hinst already"""
854 if field == 'authors' and 'author' in self.book_tags:
856 if field == 'title' and self._books != []:
858 if (field == 'themes' or field == 'themes_pl') and self.part_tags:
864 class Search(IndexStore):
868 def __init__(self, default_field="content"):
869 IndexStore.__init__(self)
870 self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34)
871 # self.analyzer = WLAnalyzer()
872 self.searcher = IndexSearcher(self.store, True)
873 self.parser = QueryParser(Version.LUCENE_34, default_field,
876 self.parent_filter = TermsFilter()
877 self.parent_filter.addTerm(Term("is_book", "true"))
879 def query(self, query):
880 """Parse query in default Lucene Syntax. (for humans)
882 return self.parser.parse(query)
884 def simple_search(self, query, max_results=50):
885 """Runs a query for books using lucene syntax. (for humans)
886 Returns (books, total_hits)
889 tops = self.searcher.search(self.query(query), max_results)
891 for found in tops.scoreDocs:
892 doc = self.searcher.doc(found.doc)
893 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
894 return (bks, tops.totalHits)
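# Example (sketch, assuming some books are already indexed):
#
#   search = Search()
#   books, total = search.simple_search(u"Tadeusz")
#   # `books` is a list of catalogue.models.Book, `total` the Lucene hit count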
896 def get_tokens(self, searched, field='content', cached=None):
897 """returns tokens analyzed by a proper (for a field) analyzer
898 argument can be: StringReader, string/unicode, or tokens. In the last case
899 they will just be returned (so we can reuse tokens, if we don't change the analyzer)
901 if cached is not None and field in cached:
904 if isinstance(searched, str) or isinstance(searched, unicode):
905 searched = StringReader(searched)
906 elif isinstance(searched, list):
910 tokens = self.analyzer.reusableTokenStream(field, searched)
912 while tokens.incrementToken():
913 cta = tokens.getAttribute(CharTermAttribute.class_)
914 toks.append(cta.toString())
916 if cached is not None:
921 def fuzziness(self, fuzzy):
922 """Helper method to sanitize fuzziness"""
925 if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
930 def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
932 Return a PhraseQuery with a series of tokens.
935 phrase = MultiPhraseQuery()
937 term = Term(field, t)
938 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
942 # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
946 if not fuzzterm.next(): break
948 phrase.add(JArray('object')(fuzzterms, Term))
952 phrase = PhraseQuery()
955 term = Term(field, t)
959 def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
961 Returns term queries joined by boolean query.
962 modal - applies to boolean query
963 fuzzy - should the query be fuzzy.
967 term = Term(field, t)
969 term = FuzzyQuery(term, self.fuzziness(fuzzy))
971 term = TermQuery(term)
972 q.add(BooleanClause(term, modal))
975 def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
976 filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
977 if filters is None: filters = []
978 if tokens_cache is None: tokens_cache = {}
980 tokens = self.get_tokens(searched, field, cached=tokens_cache)
982 query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
984 filters.append(self.term_filter(Term('is_book', 'true')))
985 top = self.searcher.search(query, self.chain_filters(filters), max_results)
987 return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
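# Sketch of a phrase search over content with snippets (assuming an open,
# populated index; the query string is just an example):
#
#   results = search.search_phrase(u"litwo ojczyzno moja", 'content',
#                                  book=False, snippets=True)
#   # each element is a SearchResult built from a single matching scoreDoc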
989 def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
990 filters=None, tokens_cache=None, boost=None, snippets=True):
991 if filters is None: filters = []
992 if tokens_cache is None: tokens_cache = {}
995 filters.append(self.term_filter(Term('is_book', 'true')))
997 query = BooleanQuery()
1000 tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1002 query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1003 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1005 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1007 return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1008 snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1010 def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1012 Search for perfect book matches. Just see if the query matches with some author or title,
1013 taking hints into account.
1015 fields_to_search = ['authors', 'title']
1018 if not hint.should_search_for_book():
1020 fields_to_search = hint.just_search_in(fields_to_search)
1021 only_in = hint.book_filter()
1023 qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1027 top = self.searcher.search(q,
1028 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1030 for found in top.scoreDocs:
1031 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1034 def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1035 fields_to_search = ['tags', 'authors', 'title']
1039 if not hint.should_search_for_book():
1041 fields_to_search = hint.just_search_in(fields_to_search)
1042 only_in = hint.book_filter()
1044 tokens = self.get_tokens(searched, field='SIMPLE')
1048 for fld in fields_to_search:
1049 q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1050 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1053 top = self.searcher.search(q,
1054 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1056 for found in top.scoreDocs:
1057 books.append(SearchResult(self, found, how_found="search_book"))
1061 def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1063 Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1064 some part/fragment of the book.
1066 qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1070 flt = hint.part_filter()
1074 top = self.searcher.search(q,
1075 self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1078 for found in top.scoreDocs:
1079 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1083 def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1085 Tries to use search terms to match different fields of book (or its parts).
1086 E.g. one word can be an author's surname, another a part of the title, and the rest
1087 some words from the third chapter.
1089 if tokens_cache is None: tokens_cache = {}
1094 only_in = hint.part_filter()
1096 # content only query : themes x content
1099 tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1100 tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1102 # only search in themes when we do not already filter by themes
1103 if hint is None or hint.just_search_in(['themes']) != []:
1104 q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1105 fuzzy=fuzzy), BooleanClause.Occur.MUST))
1107 q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1108 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1110 topDocs = self.searcher.search(q, only_in, max_results)
1111 for found in topDocs.scoreDocs:
1112 books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1113 print "* %s theme x content: %s" % (searched, books[-1]._hits)
1115 # query themes/content x author/title/tags
1117 in_content = BooleanQuery()
1118 in_meta = BooleanQuery()
1120 for fld in ['themes_pl', 'content']:
1121 in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1123 for fld in ['tags', 'authors', 'title']:
1124 in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1126 q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1127 q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1129 topDocs = self.searcher.search(q, only_in, max_results)
1130 for found in topDocs.scoreDocs:
1131 books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1132 print "* %s scatter search: %s" % (searched, books[-1]._hits)
1136 # def multisearch(self, query, max_results=50):
1139 # - (phrase) OR -> content
1142 # - (keywords) -> authors
1147 # queryreader = StringReader(query)
1148 # tokens = self.get_tokens(queryreader)
1150 # top_level = BooleanQuery()
1151 # Should = BooleanClause.Occur.SHOULD
1153 # phrase_level = BooleanQuery()
1154 # phrase_level.setBoost(1.3)
1156 # p_content = self.make_phrase(tokens, joined=True)
1157 # p_title = self.make_phrase(tokens, 'title')
1158 # p_author = self.make_phrase(tokens, 'author')
1160 # phrase_level.add(BooleanClause(p_content, Should))
1161 # phrase_level.add(BooleanClause(p_title, Should))
1162 # phrase_level.add(BooleanClause(p_author, Should))
1164 # kw_level = BooleanQuery()
1166 # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1167 # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1168 # kw_level.add(j_themes, Should)
1169 # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1170 # j_con = self.make_term_query(tokens, joined=True)
1171 # kw_level.add(j_con, Should)
1173 # top_level.add(BooleanClause(phrase_level, Should))
1174 # top_level.add(BooleanClause(kw_level, Should))
1178 def get_snippets(self, scoreDoc, query, field='content'):
1180 Returns a snippet for found scoreDoc.
1182 htmlFormatter = SimpleHTMLFormatter()
1183 highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1185 stored = self.searcher.doc(scoreDoc.doc)
1187 position = stored.get('snippets_position')
1188 length = stored.get('snippets_length')
1189 if position is None or length is None:
1192 snippets = Snippets(stored.get('book_id')).open()
1194 text = snippets.get((int(position),
1199 tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1200 # highlighter.getBestTextFragments(tokenStream, text, False, 10)
1201 snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1206 def enum_to_array(enum):
1208 Converts a lucene TermEnum to array of Terms, suitable for
1217 if not enum.next(): break
1220 return JArray('object')(terms, Term)
1222 def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1224 Search for Tag objects using query.
1227 filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1228 tops = self.searcher.search(query, filters, max_results)
1231 for found in tops.scoreDocs:
1232 doc = self.searcher.doc(found.doc)
1233 is_pdcounter = doc.get('is_pdcounter')
1235 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1237 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1238 # don't add the pdcounter tag if same tag already exists
1239 if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1241 # print "%s (%d) -> %f" % (tag, tag.id, found.score)
1242 print('returning %s' % tags)
1245 def search_books(self, query, filter=None, max_results=10):
1247 Searches for Book objects using query
1250 tops = self.searcher.search(query, filter, max_results)
1251 for found in tops.scoreDocs:
1252 doc = self.searcher.doc(found.doc)
1253 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1256 def make_prefix_phrase(self, toks, field):
1257 q = MultiPhraseQuery()
1258 for i in range(len(toks)):
1259 t = Term(field, toks[i])
1260 if i == len(toks) - 1:
1261 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1271 def term_filter(term, inverse=False):
1272 only_term = TermsFilter()
1273 only_term.addTerm(term)
1276 neg = BooleanFilter()
1277 neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1282 def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
1284 Return auto-complete hints for tags
1285 using prefix search.
1287 toks = self.get_tokens(string, field='SIMPLE')
1288 top = BooleanQuery()
1290 for field in ['tag_name', 'tag_name_pl']:
1292 q = self.make_prefix_phrase(toks, field)
1294 q = self.make_term_query(toks, field)
1295 top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1297 no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1299 return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1301 def hint_books(self, string, max_results=50, prefix=True):
1303 Returns auto-complete hints for book titles
1304 (because we do not index 'pseudo' title tags).
1307 toks = self.get_tokens(string, field='SIMPLE')
1310 q = self.make_prefix_phrase(toks, 'title')
1312 q = self.make_term_query(toks, 'title')
1314 return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
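# Sketch of the auto-complete entry points (assuming `search = Search()`):
#
#   tags = search.hint_tags(u"mick", max_results=10)       # Tag objects
#   books = search.hint_books(u"pan tad", max_results=10)  # Book objects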
1317 def chain_filters(filters, op=ChainedFilter.AND):
1319 Chains a filter list together
1321 filters = filter(lambda x: x is not None, filters)
1322 if not filters:
1324 chf = ChainedFilter(JArray('object')(filters, Filter), op)
1327 def filtered_categories(self, tags):
1329 Return a list of tag categories present in the given tags list.
1333 cats[t.category] = True