apps/search/index.py (wolnelektury.git)
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
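# initVM() must run before any of the Lucene classes above are used; the returned
# handle is kept so that other threads can attach to the JVM if they need to call
# into Lucene.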
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
33 import atexit
34 import traceback
35
36
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
38     def __init__(self):
39         polish = PolishAnalyzer(Version.LUCENE_34)
40         #        polish_gap.setPositionIncrementGap(999)
41
42         simple = SimpleAnalyzer(Version.LUCENE_34)
43         #        simple_gap.setPositionIncrementGap(999)
44
45         keyword = KeywordAnalyzer(Version.LUCENE_34)
46
47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
48
49         PerFieldAnalyzerWrapper.__init__(self, polish)
50
51         self.addAnalyzer("tags", simple)
52         self.addAnalyzer("technical_editors", simple)
53         self.addAnalyzer("editors", simple)
54         self.addAnalyzer("url", keyword)
55         self.addAnalyzer("source_url", keyword)
56         self.addAnalyzer("source_name", simple)
57         self.addAnalyzer("publisher", simple)
58         self.addAnalyzer("authors", simple)
59         self.addAnalyzer("title", simple)
60
61         self.addAnalyzer("is_book", keyword)
62         # shouldn't the title have two forms? _pl and simple?
63
64         self.addAnalyzer("themes", simple)
65         self.addAnalyzer("themes_pl", polish)
66
67         self.addAnalyzer("tag_name", simple)
68         self.addAnalyzer("tag_name_pl", polish)
69
70         self.addAnalyzer("translators", simple)
71
72         self.addAnalyzer("KEYWORD", keyword)
73         self.addAnalyzer("SIMPLE", simple)
74         self.addAnalyzer("POLISH", polish)
75
76
77 class IndexStore(object):
78     """
79     Provides access to search index.
80
81     self.store - lucene index directory
82     """
83     def __init__(self):
84         self.make_index_dir()
85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
86
87     def make_index_dir(self):
88         try:
89             os.makedirs(settings.SEARCH_INDEX)
90         except OSError as exc:
91             if exc.errno == errno.EEXIST:
92                 pass
93             else: raise
94
95
96 class IndexChecker(IndexStore):
97     def __init__(self):
98         IndexStore.__init__(self)
99
100     def check(self):
101         checker = CheckIndex(self.store)
102         status = checker.checkIndex()
103         return status
104
105
106 class Snippets(object):
107     """
108     This class manages snippet files for an indexed object (book).
109     The snippets are concatenated together, and their positions and
110     lengths are kept in Lucene index fields.
111     """
112     SNIPPET_DIR = "snippets"
113
114     def __init__(self, book_id):
115         try:
116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117         except OSError as exc:
118             if exc.errno == errno.EEXIST:
119                 pass
120             else: raise
121         self.book_id = book_id
122         self.file = None
123
124     def open(self, mode='r'):
125         """
126         Open the snippet file. Call .close() afterwards.
127         """
128         if 'b' not in mode:
129             mode += 'b'
130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
131         self.position = 0
132         return self
133
134     def add(self, snippet):
135         """
136         Append a snippet (unicode) to the snippet file.
137         Return a (position, length) tuple
138         """
139         txt = snippet.encode('utf-8')
140         l = len(txt)
141         self.file.write(txt)
142         pos = (self.position, l)
143         self.position += l
144         return pos
145
146     def get(self, pos):
147         """
148         Given a (position, length) tuple, return a unicode string
149         of the snippet stored there.
150         """
151         self.file.seek(pos[0], 0)
152         txt = self.file.read(pos[1]).decode('utf-8')
153         return txt
154
155     def close(self):
156         """Close snippet file"""
157         self.file.close()
158
159
160 class BaseIndex(IndexStore):
161     """
162     Base index class.
163     Provides basic operations on index: opening, closing, optimizing.
164     """
165     def __init__(self, analyzer=None):
166         super(BaseIndex, self).__init__()
167         self.index = None
168         if not analyzer:
169             analyzer = WLAnalyzer()
170         self.analyzer = analyzer
171
172     def open(self, analyzer=None):
173         if self.index:
174             raise Exception("Index is already opened")
175         self.index = IndexWriter(self.store, self.analyzer,\
176                                  IndexWriter.MaxFieldLength.LIMITED)
177         return self.index
178
179     def optimize(self):
180         self.index.optimize()
181
182     def close(self):
183         try:
184             self.index.optimize()
185         except JavaError, je:
186             print "Error during optimize phase, check index: %s" % je
187
188         self.index.close()
189         self.index = None
190
191     def __enter__(self):
192         self.open()
193         return self
194
195     def __exit__(self, type, value, tb):
196         self.close()
197
198
199 class Index(BaseIndex):
200     """
201     Class indexing books.
202     """
203     def __init__(self, analyzer=None):
204         super(Index, self).__init__(analyzer)
205
206     def index_tags(self):
207         """
208         Re-index global tag list.
209         Removes all tags from the index, then indexes them again.
210         Indexed fields include: id, name (with and without Polish stems), category.
211         """
212         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
213         self.index.deleteDocuments(q)
214
215         for tag in catalogue.models.Tag.objects.all():
216             doc = Document()
217             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
218             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
220             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
221             self.index.addDocument(doc)
222
223         for pdtag in PDCounterAuthor.objects.all():
224             doc = Document()
225             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
226             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
227             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
228             doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
229             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
230             self.index.addDocument(doc)
231
232     def create_book_doc(self, book):
233         """
234         Create a Lucene document referring to the book id.
235         """
236         doc = Document()
237         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
238         if book.parent is not None:
239             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
240         return doc
241
242     def remove_book(self, book):
243         """Removes a book from search index.
244         book - Book instance."""
245         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
246         self.index.deleteDocuments(q)
247
248     def index_book(self, book, book_info=None, overwrite=True):
249         """
250         Indexes the book.
251         Creates a lucene document for extracted metadata
252         and calls self.index_content() to index the contents of the book.
253         """
254         if overwrite:
255             self.remove_book(book)
256
257         book_doc = self.create_book_doc(book)
258         meta_fields = self.extract_metadata(book, book_info)
259         for f in meta_fields.values():
260             if isinstance(f, list) or isinstance(f, tuple):
261                 for elem in f:
262                     book_doc.add(elem)
263             else:
264                 book_doc.add(f)
265
266         self.index.addDocument(book_doc)
267         del book_doc
268
269         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
270
271     master_tags = [
272         'opowiadanie',
273         'powiesc',
274         'dramat_wierszowany_l',
275         'dramat_wierszowany_lp',
276         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
277         'wywiad',
278         ]
279
280     ignore_content_tags = [
281         'uwaga', 'extra',
282         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
283         'didaskalia',
284         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
285         ]
286
287     footnote_tags = ['pa', 'pt', 'pr', 'pe']
288
289     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
290
291     published_date_re = re.compile("([0-9]+)[\]. ]*$")
292
293     def extract_metadata(self, book, book_info=None):
294         """
295         Extract metadata from the book and return a map of fields keyed by field name.
296         """
297         fields = {}
298
299         if book_info is None:
300             book_info = dcparser.parse(open(book.xml_file.path))
301
302         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
303         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
304         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
305
306         # validator, name
307         for field in dcparser.BookInfo.FIELDS:
308             if hasattr(book_info, field.name):
309                 if not getattr(book_info, field.name):
310                     continue
311                 # since no type information is available, we use validator
312                 type_indicator = field.validator
313                 if type_indicator == dcparser.as_unicode:
314                     s = getattr(book_info, field.name)
315                     if field.multiple:
316                         s = ', '.join(s)
317                     try:
318                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
319                     except JavaError as je:
320                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
321                 elif type_indicator == dcparser.as_person:
322                     p = getattr(book_info, field.name)
323                     if isinstance(p, dcparser.Person):
324                         persons = unicode(p)
325                     else:
326                         persons = ', '.join(map(unicode, p))
327                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
328                 elif type_indicator == dcparser.as_date:
329                     dt = getattr(book_info, field.name)
330                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
331                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
332
333         # get published date
334         if hasattr(book_info, 'source_name') and book_info.source_name:
335             source = book_info.source_name
336             match = self.published_date_re.search(source)
337             if match is not None:
338                 fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
339
340         return fields
341
342     def add_gaps(self, fields, fieldname):
343         """
344         Interleaves a list of fields with gap fields (indexed single spaces) and returns the result.
345         This allows phrase queries that do not cross the gaps (when slop is 0).
346         """
347         def gap():
348             while True:
349                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
350         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
351
352     def get_master(self, root):
353         """
354         Returns the first master tag from an etree.
355         """
356         for master in root.iter():
357             if master.tag in self.master_tags:
358                 return master
359
360     def index_content(self, book, book_fields=[]):
361         """
362         Walks the book XML and extracts content from it.
363         Adds parts for each header tag and for each fragment.
364         """
365         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
366         root = wld.edoc.getroot()
367
368         master = self.get_master(root)
369         if master is None:
370             return []
371
372         def walker(node, ignore_tags=[]):
373             yield node, None
374             for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
375                 for b, e in walker(child):
376                     yield b, e
377             yield None, node
378             return
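        # walker() yields a pre/post-order event stream: (node, None) when a node
        # is entered and (None, node) when it is left. Children whose tag is in
        # ignore_tags are skipped; note that the recursive call does not propagate
        # ignore_tags, so only direct children of the given node are filtered.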
379
380         def fix_format(text):
381             #            separator = [u" ", u"\t", u".", u";", u","]
382             if isinstance(text, list):
383                 # need to join it first
384                 text = filter(lambda s: s is not None, text)
385                 text = u' '.join(text)
386                 # for i in range(len(text)):
387                 #     if i > 0:
388                 #         if text[i][0] not in separator\
389                 #             and text[i - 1][-1] not in separator:
390                 #          text.insert(i, u" ")
391
392             return re.sub("(?m)/$", "", text)
393
394         def add_part(snippets, **fields):
395             doc = self.create_book_doc(book)
396             for f in book_fields:
397                 doc.add(f)
398
399             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
400             doc.add(NumericField("header_span", Field.Store.YES, True)\
401                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
402             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
403
404             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
405                           Field.TermVector.WITH_POSITIONS_OFFSETS))
406
407             snip_pos = snippets.add(fields["content"])
408             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
409             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
410
411             if 'fragment_anchor' in fields:
412                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
413                               Field.Store.YES, Field.Index.NOT_ANALYZED))
414
415             if 'themes' in fields:
416                 themes, themes_pl = zip(*[
417                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
418                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
419                      for theme in fields['themes']])
420
421                 themes = self.add_gaps(themes, 'themes')
422                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
423
424                 for t in themes:
425                     doc.add(t)
426                 for t in themes_pl:
427                     doc.add(t)
428
429             return doc
430
431         def give_me_utf8(s):
432             if isinstance(s, unicode):
433                 return s.encode('utf-8')
434             else:
435                 return s
436
437         fragments = {}
438         snippets = Snippets(book.id).open('w')
439         try:
440             for header, position in zip(list(master), range(len(master))):
441
442                 if header.tag in self.skip_header_tags:
443                     continue
444                 if header.tag is etree.Comment:
445                     continue
446
447                 # section content
448                 content = []
449                 footnote = None
450
451                 for start, end in walker(header, ignore_tags=self.ignore_content_tags):
452                     # handle footnotes
453                     # if start is not None and start.tag in self.footnote_tags:
454                     #     footnote = ' '.join(start.itertext())
455                     # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
456                     #     doc = add_part(snippets, header_index=position, header_type=header.tag,
457                     #                    content=footnote)
458
459                     #     self.index.addDocument(doc)
460
461                     #     footnote = None
462
463                     # handle fragments and themes.
464                     if start is not None and start.tag == 'begin':
465                         fid = start.attrib['id'][1:]
466                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
467
468                     elif start is not None and start.tag == 'motyw':
469                         fid = start.attrib['id'][1:]
470                         if start.text is not None:
471                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
472
473                     elif start is not None and start.tag == 'end':
474                         fid = start.attrib['id'][1:]
475                         if fid not in fragments:
476                             continue  # a broken <end> node, skip it
478                         frag = fragments[fid]
479                         if frag['themes'] == []:
480                             continue  # empty themes list.
481                         del fragments[fid]
482
483                         doc = add_part(snippets,
484                                        header_type=frag['start_header'],
485                                        header_index=frag['start_section'],
486                                        header_span=position - frag['start_section'] + 1,
487                                        fragment_anchor=fid,
488                                        content=fix_format(frag['content']),
489                                        themes=frag['themes'])
490
491                         self.index.addDocument(doc)
492
493                         # Collect content.
494                     elif start is not None:
495                         for frag in fragments.values():
496                             frag['content'].append(start.text)
497                         content.append(start.text)
498                     elif end is not None:
499                         for frag in fragments.values():
500                             frag['content'].append(end.tail)
501                         content.append(end.tail)
502
503                 # in the end, add the section text.
504                 doc = add_part(snippets, header_index=position, header_type=header.tag,
505                                content=fix_format(content))
506
507                 self.index.addDocument(doc)
508
509         finally:
510             snippets.close()
511
512
513 def log_exception_wrapper(f):
514     def _wrap(*a):
515         try:
516             f(*a)
517         except Exception, e:
518             print("Error in indexing thread: %s" % e)
519             traceback.print_exc()
520             raise e
521     return _wrap
522
523
524 class ReusableIndex(Index):
525     """
526     Works like Index, but does not close/optimize the Lucene index
527     until program exit (uses an atexit hook).
528     This is useful for the importbooks command.
529
530     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
531     """
532     index = None
533
534     def open(self, analyzer=None, threads=4):
535         if ReusableIndex.index is not None:
536             self.index = ReusableIndex.index
537         else:
538             print("opening index")
539             Index.open(self, analyzer)
540             ReusableIndex.index = self.index
541             atexit.register(ReusableIndex.close_reusable)
542
543     # def index_book(self, *args, **kw):
544     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
545     #     ReusableIndex.pool_jobs.append(job)
546
547     @staticmethod
548     def close_reusable():
549         if ReusableIndex.index is not None:
550             ReusableIndex.index.optimize()
551             ReusableIndex.index.close()
552             ReusableIndex.index = None
553
554     def close(self):
555         pass
556
557
558 class JoinSearch(object):
559     """
560     This mixin could be used to handle block join queries.
561     (currently unused)
562     """
563     def __init__(self, *args, **kw):
564         super(JoinSearch, self).__init__(*args, **kw)
565
566     def wrapjoins(self, query, fields=[]):
567         """
568         This function modifies the query recursively, so that contained
569         Term and Phrase queries which match the provided fields
570         are wrapped in a BlockJoinQuery and thus delegated
571         to child documents.
572         """
573         if BooleanQuery.instance_(query):
574             qs = BooleanQuery.cast_(query)
575             for clause in qs:
576                 clause = BooleanClause.cast_(clause)
577                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
578             return qs
579         else:
580             termset = HashSet()
581             query.extractTerms(termset)
582             for t in termset:
583                 t = Term.cast_(t)
584                 if t.field() not in fields:
585                     return query
586             return BlockJoinQuery(query, self.parent_filter,
587                                   BlockJoinQuery.ScoreMode.Total)
588
589     def bsearch(self, query, max_results=50):
590         q = self.query(query)
591         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
592
593         tops = self.searcher.search(bjq, max_results)
594         bks = []
595         for found in tops.scoreDocs:
596             doc = self.searcher.doc(found.doc)
597             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
598         return (bks, tops.totalHits)
599
600
601 class SearchResult(object):
602     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
603         if tokens_cache is None: tokens_cache = {}
604
605         if score:
606             self._score = score
607         else:
608             self._score = scoreDocs.score
609
610         self.boost = 1.0
611
612         self._hits = []
613         self._processed_hits = None  # processed hits
614
615         stored = search.searcher.doc(scoreDocs.doc)
616         self.book_id = int(stored.get("book_id"))
617
618         pd = stored.get("published_date")
619         if pd is None:
620             pd = 0
621         self.published_date = int(pd)
622
623         header_type = stored.get("header_type")
624         # we have a content hit in some header or fragment
625         if header_type is not None:
626             sec = (header_type, int(stored.get("header_index")))
627             header_span = stored.get('header_span')
628             header_span = header_span is not None and int(header_span) or 1
629
630             fragment = stored.get("fragment_anchor")
631
632             if snippets:
633                 snippets = snippets.replace("/\n", "\n")
634             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
635
636             self._hits.append(hit)
637
638         self.search = search
639         self.searched = searched
640         self.tokens_cache = tokens_cache
641
642     @property
643     def score(self):
644         return self._score * self.boost
645
646     def merge(self, other):
647         if self.book_id != other.book_id:
648             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
649         self._hits += other._hits
650         if other.score > self.score:
651             self._score = other._score
652         return self
653
654     def get_book(self):
655         return catalogue.models.Book.objects.get(id=self.book_id)
656
657     book = property(get_book)
658
659     @property
660     def hits(self):
661         if self._processed_hits is not None:
662             return self._processed_hits
663
664         POSITION = 0
665         FRAGMENT = 1
666         POSITION_INDEX = 1
667         POSITION_SPAN = 2
668         SCORE = 2
669         OTHER = 3
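        # indexes into the raw hit tuples collected in __init__:
        # ((header_type, header_index, header_span), fragment_anchor, score, {extra})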
670
671         # to sections and fragments
672         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
673         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
674         sect = filter(lambda s: 0 == len(filter(
675             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
676             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
677             frags)), sect)
678
679         hits = []
680
681         # remove duplicate fragments
682         fragments = {}
683         for f in frags:
684             fid = f[FRAGMENT]
685             if fid in fragments:
686                 if fragments[fid][SCORE] >= f[SCORE]:
687                     continue
688             fragments[fid] = f
689         frags = fragments.values()
690
691         # remove duplicate sections
692         sections = {}
693
694         for s in sect:
695             si = s[POSITION][POSITION_INDEX]
696             # skip existing
697             if si in sections:
698                 if sections[si]['score'] >= s[SCORE]:
699                     continue
700
701             m = {'score': s[SCORE],
702                  'section_number': s[POSITION][POSITION_INDEX] + 1,
703                  }
704             m.update(s[OTHER])
705             sections[si] = m
706
707         hits = sections.values()
708
709         for f in frags:
710             try:
711                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
712             except catalogue.models.Fragment.DoesNotExist:
713                 # stale index
714                 continue
715
716             # Figure out if we were searching for a token matching some word in theme name.
717             themes = frag.tags.filter(category='theme')
718             themes_hit = []
719             if self.searched is not None:
720                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
721                 for theme in themes:
722                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
723                     for t in tokens:
724                         if t in name_tokens:
725                             if not theme in themes_hit:
726                                 themes_hit.append(theme)
727                             break
728
729             m = {'score': f[SCORE],
730                  'fragment': frag,
731                  'section_number': f[POSITION][POSITION_INDEX] + 1,
732                  'themes': themes,
733                  'themes_hit': themes_hit
734                  }
735             m.update(f[OTHER])
736             hits.append(m)
737
738         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
739
740         self._processed_hits = hits
741
742         return hits
743
744     def __unicode__(self):
745         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
746
747     @staticmethod
748     def aggregate(*result_lists):
749         books = {}
750         for rl in result_lists:
751             for r in rl:
752                 if r.book_id in books:
753                     books[r.book_id].merge(r)
754                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
755                 else:
756                     books[r.book_id] = r
757         return books.values()
758
759     def __cmp__(self, other):
760         c = cmp(self.score, other.score)
761         if c == 0:
762             # this is inverted, because earlier date is better
763             return cmp(other.published_date, self.published_date)
764         else:
765             return c
766
767
768 class Hint(object):
769     """
770     Given some hint information (things we already know about
771     the search target - like author, title of a specific book, epoch, genre, kind)
772     we can narrow down the search using filters.
773     """
774     def __init__(self, search):
775         """
776         Accepts a Searcher instance.
777         """
778         self.search = search
779         self.book_tags = {}
780         self.part_tags = []
781         self._books = []
782
783     def books(self, *books):
784         """
785         Give a hint that we search these books.
786         """
787         self._books = books
788
789     def tags(self, tags):
790         """
791         Give a hint that these Tag objects (a list)
792         are necessary.
793         """
794         for t in tags:
795             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
796                 lst = self.book_tags.get(t.category, [])
797                 lst.append(t)
798                 self.book_tags[t.category] = lst
799             if t.category in ['theme', 'theme_pl']:
800                 self.part_tags.append(t)
801
802     def tag_filter(self, tags, field='tags'):
803         """
804         Given a list of tags and an optional field (tags are normally in the 'tags' field),
805         return a filter accepting only books with those specific tags.
806         """
807         q = BooleanQuery()
808
809         for tag in tags:
810             toks = self.search.get_tokens(tag.name, field=field)
811             tag_phrase = PhraseQuery()
812             for tok in toks:
813                 tag_phrase.add(Term(field, tok))
814             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
815
816         return QueryWrapperFilter(q)
817
818     def book_filter(self):
819         """
820         Filters using book tags (all tag kinds except a theme)
821         """
822         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
823         if tags:
824             return self.tag_filter(tags)
825         else:
826             return None
827
828     def part_filter(self):
829         """
830         This filter can be used to look for book parts.
831         It filters on book id and/or themes.
832         """
833         fs = []
834         if self.part_tags:
835             fs.append(self.tag_filter(self.part_tags, field='themes'))
836
837         if self._books != []:
838             bf = BooleanFilter()
839             for b in self._books:
840                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
841                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
842             fs.append(bf)
843
844         return Search.chain_filters(fs)
845
846     def should_search_for_book(self):
847         return self._books == []
848
849     def just_search_in(self, all):
850         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
851         some = []
852         for field in all:
853             if field == 'authors' and 'author' in self.book_tags:
854                 continue
855             if field == 'title' and self._books != []:
856                 continue
857             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
858                 continue
859             some.append(field)
860         return some
861
862
863 class Search(IndexStore):
864     """
865     Search facilities.
866     """
867     def __init__(self, default_field="content"):
868         IndexStore.__init__(self)
869         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
870         # self.analyzer = WLAnalyzer()
871         self.searcher = IndexSearcher(self.store, True)
872         self.parser = QueryParser(Version.LUCENE_34, default_field,
873                                   self.analyzer)
874
875         self.parent_filter = TermsFilter()
876         self.parent_filter.addTerm(Term("is_book", "true"))
877
878     def query(self, query):
879         """Parse query in default Lucene Syntax. (for humans)
880         """
881         return self.parser.parse(query)
882
883     def simple_search(self, query, max_results=50):
884         """Runs a query for books using lucene syntax. (for humans)
885         Returns (books, total_hits)
886         """
887
888         tops = self.searcher.search(self.query(query), max_results)
889         bks = []
890         for found in tops.scoreDocs:
891             doc = self.searcher.doc(found.doc)
892             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
893         return (bks, tops.totalHits)
894
895     def get_tokens(self, searched, field='content', cached=None):
896         """returns tokens analyzed by a proper (for a field) analyzer
897         argument can be: StringReader, string/unicode, or tokens. In the last case
898         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
899         """
900         if cached is not None and field in cached:
901             return cached[field]
902
903         if isinstance(searched, str) or isinstance(searched, unicode):
904             searched = StringReader(searched)
905         elif isinstance(searched, list):
906             return searched
907
908         searched.reset()
909         tokens = self.analyzer.reusableTokenStream(field, searched)
910         toks = []
911         while tokens.incrementToken():
912             cta = tokens.getAttribute(CharTermAttribute.class_)
913             toks.append(cta.toString())
914
915         if cached is not None:
916             cached[field] = toks
917
918         return toks
919
920     def fuzziness(self, fuzzy):
921         """Helper method to sanitize fuzziness"""
922         if not fuzzy:
923             return None
924         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
925             return fuzzy
926         else:
927             return 0.5
928
929     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
930         """
931         Return a PhraseQuery with a series of tokens.
932         """
933         if fuzzy:
934             phrase = MultiPhraseQuery()
935             for t in tokens:
936                 term = Term(field, t)
937                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
938                 fuzzterms = []
939
940                 while True:
941                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
942                     ft = fuzzterm.term()
943                     if ft:
944                         fuzzterms.append(ft)
945                     if not fuzzterm.next(): break
946                 if fuzzterms:
947                     phrase.add(JArray('object')(fuzzterms, Term))
948                 else:
949                     phrase.add(term)
950         else:
951             phrase = PhraseQuery()
952             phrase.setSlop(slop)
953             for t in tokens:
954                 term = Term(field, t)
955                 phrase.add(term)
956         return phrase
957
958     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
959         """
960         Returns term queries joined by a boolean query.
961         modal - the Occur clause applied to each term in the boolean query
962         fuzzy - should the query be fuzzy.
963         """
964         q = BooleanQuery()
965         for t in tokens:
966             term = Term(field, t)
967             if fuzzy:
968                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
969             else:
970                 term = TermQuery(term)
971             q.add(BooleanClause(term, modal))
972         return q
973
974     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
975                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
976         if filters is None: filters = []
977         if tokens_cache is None: tokens_cache = {}
978
979         tokens = self.get_tokens(searched, field, cached=tokens_cache)
980
981         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
982         if book:
983             filters.append(self.term_filter(Term('is_book', 'true')))
984         top = self.searcher.search(query, self.chain_filters(filters), max_results)
985
986         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
987
988     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
989                     filters=None, tokens_cache=None, boost=None, snippets=True):
990         if filters is None: filters = []
991         if tokens_cache is None: tokens_cache = {}
992
993         if book:
994             filters.append(self.term_filter(Term('is_book', 'true')))
995
996         query = BooleanQuery()
997
998         for fld in fields:
999             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1000
1001             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1002                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1003
1004         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1005
1006         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1007                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1008
1009     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1010         """
1011         Search for perfect book matches. Just see if the query matches with some author or title,
1012         taking hints into account.
1013         """
1014         fields_to_search = ['authors', 'title']
1015         only_in = None
1016         if hint:
1017             if not hint.should_search_for_book():
1018                 return []
1019             fields_to_search = hint.just_search_in(fields_to_search)
1020             only_in = hint.book_filter()
1021
1022         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1023
1024         books = []
1025         for q in qrys:
1026             top = self.searcher.search(q,
1027                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1028                 max_results)
1029             for found in top.scoreDocs:
1030                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1031         return books
1032
1033     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1034         fields_to_search = ['tags', 'authors', 'title']
1035
1036         only_in = None
1037         if hint:
1038             if not hint.should_search_for_book():
1039                 return []
1040             fields_to_search = hint.just_search_in(fields_to_search)
1041             only_in = hint.book_filter()
1042
1043         tokens = self.get_tokens(searched, field='SIMPLE')
1044
1045         q = BooleanQuery()
1046
1047         for fld in fields_to_search:
1048             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1049                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1050
1051         books = []
1052         top = self.searcher.search(q,
1053                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1054             max_results)
1055         for found in top.scoreDocs:
1056             books.append(SearchResult(self, found, how_found="search_book"))
1057
1058         return books
1059
1060     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1061         """
1062         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1063         some part/fragment of the book.
1064         """
1065         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1066
1067         flt = None
1068         if hint:
1069             flt = hint.part_filter()
1070
1071         books = []
1072         for q in qrys:
1073             top = self.searcher.search(q,
1074                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1075                                                            flt]),
1076                                        max_results)
1077             for found in top.scoreDocs:
1078                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1079
1080         return books
1081
1082     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1083         """
1084         Tries to use the search terms to match different fields of the book (or its parts).
1085         E.g. one word can match an author's surname, another can be part of the title, and the rest
1086         can be some words from the third chapter.
1087         """
1088         if tokens_cache is None: tokens_cache = {}
1089         books = []
1090         only_in = None
1091
1092         if hint:
1093             only_in = hint.part_filter()
1094
1095         # content only query : themes x content
1096         q = BooleanQuery()
1097
1098         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1099         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1100
1101         # only search in themes when we do not already filter by themes
1102         if hint is None or hint.just_search_in(['themes']) != []:
1103             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1104                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1105
1106         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1107                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1108
1109         topDocs = self.searcher.search(q, only_in, max_results)
1110         for found in topDocs.scoreDocs:
1111             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1112             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1113
1114         # query themes/content x author/title/tags
1115         q = BooleanQuery()
1116         in_content = BooleanQuery()
1117         in_meta = BooleanQuery()
1118
1119         for fld in ['themes_pl', 'content']:
1120             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1121
1122         for fld in ['tags', 'authors', 'title']:
1123             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1124
1125         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1126         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1127
1128         topDocs = self.searcher.search(q, only_in, max_results)
1129         for found in topDocs.scoreDocs:
1130             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1131             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1132
1133         return books
1134
1135     # def multisearch(self, query, max_results=50):
1136     #     """
1137     #     Search strategy:
1138     #     - (phrase) OR -> content
1139     #                   -> title
1140     #                   -> authors
1141     #     - (keywords)  -> authors
1142     #                   -> motyw
1143     #                   -> tags
1144     #                   -> content
1145     #     """
1146         # queryreader = StringReader(query)
1147         # tokens = self.get_tokens(queryreader)
1148
1149         # top_level = BooleanQuery()
1150         # Should = BooleanClause.Occur.SHOULD
1151
1152         # phrase_level = BooleanQuery()
1153         # phrase_level.setBoost(1.3)
1154
1155         # p_content = self.make_phrase(tokens, joined=True)
1156         # p_title = self.make_phrase(tokens, 'title')
1157         # p_author = self.make_phrase(tokens, 'author')
1158
1159         # phrase_level.add(BooleanClause(p_content, Should))
1160         # phrase_level.add(BooleanClause(p_title, Should))
1161         # phrase_level.add(BooleanClause(p_author, Should))
1162
1163         # kw_level = BooleanQuery()
1164
1165         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1166         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1167         # kw_level.add(j_themes, Should)
1168         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1169         # j_con = self.make_term_query(tokens, joined=True)
1170         # kw_level.add(j_con, Should)
1171
1172         # top_level.add(BooleanClause(phrase_level, Should))
1173         # top_level.add(BooleanClause(kw_level, Should))
1174
1175         # return None
1176
1177     def get_snippets(self, scoreDoc, query, field='content'):
1178         """
1179         Returns a snippet for found scoreDoc.
1180         """
1181         htmlFormatter = SimpleHTMLFormatter()
1182         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1183
1184         stored = self.searcher.doc(scoreDoc.doc)
1185
1186         position = stored.get('snippets_position')
1187         length = stored.get('snippets_length')
1188         if position is None or length is None:
1189             return None
1190         # locate content.
1191         snippets = Snippets(stored.get('book_id')).open()
1192         try:
1193             text = snippets.get((int(position),
1194                                  int(length)))
1195         finally:
1196             snippets.close()
1197
1198         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1199         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1200         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1201
1202         return snip
1203
1204     @staticmethod
1205     def enum_to_array(enum):
1206         """
1207         Converts a lucene TermEnum to array of Terms, suitable for
1208         addition to queries
1209         """
1210         terms = []
1211
1212         while True:
1213             t = enum.term()
1214             if t:
1215                 terms.append(t)
1216             if not enum.next(): break
1217
1218         if terms:
1219             return JArray('object')(terms, Term)
1220
1221     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1222         """
1223         Search for Tag objects using query.
1224         """
1225         if not pdcounter:
1226             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1227         tops = self.searcher.search(query, filters, max_results)
1228
1229         tags = []
1230         for found in tops.scoreDocs:
1231             doc = self.searcher.doc(found.doc)
1232             is_pdcounter = doc.get('is_pdcounter')
1233             if is_pdcounter:
1234                 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1235             else:
1236                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1237                 # don't add the pdcounter tag if same tag already exists
1238             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1239                 tags.append(tag)
1240                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1241         print 'returning %s' % tags
1242         return tags
1243
1244     def search_books(self, query, filter=None, max_results=10):
1245         """
1246         Searches for Book objects using query
1247         """
1248         bks = []
1249         tops = self.searcher.search(query, filter, max_results)
1250         for found in tops.scoreDocs:
1251             doc = self.searcher.doc(found.doc)
1252             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1253         return bks
1254
1255     def make_prefix_phrase(self, toks, field):
1256         q = MultiPhraseQuery()
1257         for i in range(len(toks)):
1258             t = Term(field, toks[i])
1259             if i == len(toks) - 1:
1260                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1261                 if pterms:
1262                     q.add(pterms)
1263                 else:
1264                     q.add(t)
1265             else:
1266                 q.add(t)
1267         return q
1268
1269     @staticmethod
1270     def term_filter(term, inverse=False):
1271         only_term = TermsFilter()
1272         only_term.addTerm(term)
1273
1274         if inverse:
1275             neg = BooleanFilter()
1276             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1277             only_term = neg
1278
1279         return only_term
1280
1281     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
1282         """
1283         Return auto-complete hints for tags
1284         using prefix search.
1285         """
1286         toks = self.get_tokens(string, field='SIMPLE')
1287         top = BooleanQuery()
1288
1289         for field in ['tag_name', 'tag_name_pl']:
1290             if prefix:
1291                 q = self.make_prefix_phrase(toks, field)
1292             else:
1293                 q = self.make_term_query(toks, field)
1294             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1295
1296         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1297
1298         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1299
1300     def hint_books(self, string, max_results=50, prefix=True):
1301         """
1302         Returns auto-complete hints for book titles
1303         (needed because we do not index 'pseudo' title tags).
1304         Uses prefix search.
1305         """
1306         toks = self.get_tokens(string, field='SIMPLE')
1307
1308         if prefix:
1309             q = self.make_prefix_phrase(toks, 'title')
1310         else:
1311             q = self.make_term_query(toks, 'title')
1312
1313         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1314
1315     @staticmethod
1316     def chain_filters(filters, op=ChainedFilter.AND):
1317         """
1318         Chains a filter list together
1319         """
1320         filters = filter(lambda x: x is not None, filters)
1321         if not filters:
1322             return None
1323         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1324         return chf
1325
1326     def filtered_categories(self, tags):
1327         """
1328         Return a list of tag categories, present in tags list.
1329         """
1330         cats = {}
1331         for t in tags:
1332             cats[t.category] = True
1333         return cats.keys()
1334
1335     def hint(self):
1336         return Hint(self)