apps/search/index.py

   1 # -*- coding: utf-8 -*-
   2
   3 from django.conf import settings
   4 from lucene import SimpleFSDirectory, IndexWriter, File, Field, \
   5     NumericField, Version, Document, JavaError, IndexSearcher, \
   6     QueryParser, PerFieldAnalyzerWrapper, \
   7     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
   8     KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
   9     BlockJoinQuery, BlockJoinCollector, TermsFilter, \
  10     HashSet, BooleanClause, Term, CharTermAttribute, \
  11     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, BlockJoinQuery, \
  12     FuzzyQuery, FuzzyTermEnum, Sort, Integer, \
  13     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
  14     initVM, CLASSPATH, JArray
  15     # KeywordAnalyzer
  16 JVM = initVM(CLASSPATH)
  17 import sys
  18 import os
  19 import errno
  20 from librarian import dcparser
  21 from librarian.parser import WLDocument
  22 import catalogue.models
  23 from multiprocessing.pool import ThreadPool
  24 from threading import current_thread
  25 import atexit
  26 import traceback
  27
  28
  29 class WLAnalyzer(PerFieldAnalyzerWrapper):
  30     def __init__(self):
  31         polish = PolishAnalyzer(Version.LUCENE_34)
  32         simple = SimpleAnalyzer(Version.LUCENE_34)
  33         keyword = KeywordAnalyzer(Version.LUCENE_34)
  34         # not sure if needed: there's NOT_ANALYZED meaning basically the same
  35
  36         PerFieldAnalyzerWrapper.__init__(self, polish)
  37
  38         self.addAnalyzer("tags", simple)
  39         self.addAnalyzer("technical_editors", simple)
  40         self.addAnalyzer("editors", simple)
  41         self.addAnalyzer("url", keyword)
  42         self.addAnalyzer("source_url", keyword)
  43         self.addAnalyzer("source_name", simple)
  44         self.addAnalyzer("publisher", simple)
  45         self.addAnalyzer("author", simple)
  46         self.addAnalyzer("is_book", keyword)
  47
  48         self.addAnalyzer("KEYWORD", keyword)
  49         self.addAnalyzer("SIMPLE", simple)
  50         self.addAnalyzer("NATURAL", polish)
  51
  52
  53 class IndexStore(object):
  54     def __init__(self):
  55         self.make_index_dir()
  56         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
  57
  58     def make_index_dir(self):
  59         try:
  60             os.makedirs(settings.SEARCH_INDEX)
  61         except OSError as exc:
  62             if exc.errno == errno.EEXIST:
  63                 pass
  64             else: raise
  65
  66
  67 class Index(IndexStore):
  68     def __init__(self, analyzer=None):
  69         IndexStore.__init__(self)
  70         self.index = None
  71         if not analyzer:
  72             analyzer = WLAnalyzer()
  73         self.analyzer = analyzer
  74
  75     def open(self, analyzer=None):
  76         if self.index:
  77             raise Exception("Index is already opened")
  78         self.index = IndexWriter(self.store, self.analyzer,\
  79                                  IndexWriter.MaxFieldLength.LIMITED)
  80         return self.index
  81
  82     def close(self):
  83         self.index.optimize()
  84         self.index.close()
  85         self.index = None
  86
  87     def remove_book(self, book):
  88         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
  89         self.index.deleteDocuments(q)
  90
  91     def index_book(self, book, overwrite=True):
  92         if overwrite:
  93             self.remove_book(book)
  94
  95         doc = self.extract_metadata(book)
  96         parts = self.extract_content(book)
  97         block = ArrayList().of_(Document)
  98
  99         for p in parts:
 100             block.add(p)
 101         block.add(doc)
 102         self.index.addDocuments(block)
 103
 104     master_tags = [
 105         'opowiadanie',
 106         'powiesc',
 107         'dramat_wierszowany_l',
 108         'dramat_wierszowany_lp',
 109         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 110         'wywiad'
 111         ]
 112
 113     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
 114
 115     def create_book_doc(self, book):
 116         """
 117         Create a lucene document connected to the book
 118         """
 119         doc = Document()
 120         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
 121         if book.parent is not None:
 122             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
 123         return doc
 124
 125     def extract_metadata(self, book):
 126         book_info = dcparser.parse(book.xml_file)
 127
 128         print("extract metadata for book %s id=%d, thread%d" % (book.slug, book.id, current_thread().ident))
 129
 130         doc = self.create_book_doc(book)
 131         doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
 132         doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
 133         doc.add(Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
 134
 135         # validator, name
 136         for field in dcparser.BookInfo.FIELDS:
 137             if hasattr(book_info, field.name):
 138                 if not getattr(book_info, field.name):
 139                     continue
 140                 # since no type information is available, we use validator
 141                 type_indicator = field.validator
 142                 if type_indicator == dcparser.as_unicode:
 143                     s = getattr(book_info, field.name)
 144                     if field.multiple:
 145                         s = ', '.join(s)
 146                     try:
 147                         doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED))
 148                     except JavaError as je:
 149                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
 150                 elif type_indicator == dcparser.as_person:
 151                     p = getattr(book_info, field.name)
 152                     if isinstance(p, dcparser.Person):
 153                         persons = unicode(p)
 154                     else:
 155                         persons = ', '.join(map(unicode, p))
 156                     doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED))
 157                 elif type_indicator == dcparser.as_date:
 158                     dt = getattr(book_info, field.name)
 159                     doc.add(Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED))
 160         return doc
 161
 162     def get_master(self, root):
 163         for master in root.iter():
 164             if master.tag in self.master_tags:
 165                 return master
 166
 167     def extract_content(self, book):
 168         wld = WLDocument.from_file(book.xml_file.path)
 169         root = wld.edoc.getroot()
 170
 171         # first we build a sequence of top-level items.
 172         # book_id
 173         # header_index - the 0-indexed position of header element.
 174         # content
 175         master = self.get_master(root)
 176         if master is None:
 177             return []
 178
 179         header_docs = []
 180         for header, position in zip(list(master), range(len(master))):
 181             if header.tag in self.skip_header_tags:
 182                 continue
 183             doc = self.create_book_doc(book)
 184             doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
 185             doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
 186             content = u' '.join([t for t in header.itertext()])
 187             doc.add(Field("content", content, Field.Store.YES, Field.Index.ANALYZED))
 188             header_docs.append(doc)
 189
 190         def walker(node):
 191             yield node, None
 192             for child in list(node):
 193                 for b, e in walker(child):
 194                     yield b, e
 195             yield None, node
 196             return
 197
 198         # Then we create a document for each fragments
 199         # fragment_anchor - the anchor
 200         # themes - list of themes [not indexed]
 201         fragment_docs = []
 202         # will contain (framgent id -> { content: [], themes: [] }
 203         fragments = {}
 204         for start, end in walker(master):
 205             if start is not None and start.tag == 'begin':
 206                 fid = start.attrib['id'][1:]
 207                 fragments[fid] = {'content': [], 'themes': []}
 208                 fragments[fid]['content'].append(start.tail)
 209             elif start is not None and start.tag == 'motyw':
 210                 fid = start.attrib['id'][1:]
 211                 fragments[fid]['themes'].append(start.text)
 212                 fragments[fid]['content'].append(start.tail)
 213             elif start is not None and start.tag == 'end':
 214                 fid = start.attrib['id'][1:]
 215                 if fid not in fragments:
 216                     continue  # a broken <end> node, skip it
 217                 frag = fragments[fid]
 218                 del fragments[fid]
 219
 220                 def jstr(l):
 221                     return u' '.join(map(
 222                         lambda x: x == None and u'(none)' or unicode(x),
 223                         l))
 224
 225                 doc = self.create_book_doc(book)
 226                 doc.add(Field("fragment_anchor", fid,
 227                               Field.Store.YES, Field.Index.NOT_ANALYZED))
 228                 doc.add(Field("content",
 229                               u' '.join(filter(lambda s: s is not None, frag['content'])),
 230                               Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
 231                 doc.add(Field("themes",
 232                               u' '.join(filter(lambda s: s is not None, frag['themes'])),
 233                               Field.Store.NO, Field.Index.ANALYZED))
 234
 235                 fragment_docs.append(doc)
 236             elif start is not None:
 237                 for frag in fragments.values():
 238                     frag['content'].append(start.text)
 239             elif end is not None:
 240                 for frag in fragments.values():
 241                     frag['content'].append(end.tail)
 242
 243         return header_docs + fragment_docs
 244
 245     def __enter__(self):
 246         self.open()
 247         return self
 248
 249     def __exit__(self, type, value, tb):
 250         self.close()
 251
 252
 253 def log_exception_wrapper(f):
 254     def _wrap(*a):
 255         try:
 256             f(*a)
 257         except Exception, e:
 258             print("Error in indexing thread: %s" % e)
 259             traceback.print_exc()
 260             raise e
 261     return _wrap
 262
 263
 264 class ReusableIndex(Index):
 265     """
 266     Works like index, but does not close/optimize Lucene index
 267     until program exit (uses atexit hook).
 268     This is usefull for importbooks command.
 269
 270     if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
 271     """
 272     index = None
 273     pool = None
 274     pool_jobs = None
 275
 276     def open(self, analyzer=None, threads=4):
 277         if ReusableIndex.index is not None:
 278             self.index = ReusableIndex.index
 279         else:
 280             print("opening index")
 281             ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread() )
 282             ReusableIndex.pool_jobs = []
 283             Index.open(self, analyzer)
 284             ReusableIndex.index = self.index
 285             atexit.register(ReusableIndex.close_reusable)
 286
 287     def index_book(self, *args, **kw):
 288         job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
 289         ReusableIndex.pool_jobs.append(job)
 290
 291     @staticmethod
 292     def close_reusable():
 293         if ReusableIndex.index is not None:
 294             print("wait for indexing to finish")
 295             for job in ReusableIndex.pool_jobs:
 296                 job.get()
 297                 sys.stdout.write('.')
 298                 sys.stdout.flush()
 299             print("done.")
 300             ReusableIndex.pool.close()
 301
 302             ReusableIndex.index.optimize()
 303             ReusableIndex.index.close()
 304             ReusableIndex.index = None
 305
 306     def close(self):
 307         pass
 308
 309
 310 class Search(IndexStore):
 311     def __init__(self, default_field="content"):
 312         IndexStore.__init__(self)
 313         self.analyzer = WLAnalyzer() #PolishAnalyzer(Version.LUCENE_34)
 314         ## self.analyzer = WLAnalyzer()
 315         self.searcher = IndexSearcher(self.store, True)
 316         self.parser = QueryParser(Version.LUCENE_34, default_field,
 317                                   self.analyzer)
 318
 319         self.parent_filter = TermsFilter()
 320         self.parent_filter.addTerm(Term("is_book", "true"))
 321
 322     def query(self, query):
 323         return self.parser.parse(query)
 324
 325     def wrapjoins(self, query, fields=[]):
 326         """
 327         This functions modifies the query in a recursive way,
 328         so Term and Phrase Queries contained, which match
 329         provided fields are wrapped in a BlockJoinQuery,
 330         and so delegated to children documents.
 331         """
 332         if BooleanQuery.instance_(query):
 333             qs = BooleanQuery.cast_(query)
 334             for clause in qs:
 335                 clause = BooleanClause.cast_(clause)
 336                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
 337             return qs
 338         else:
 339             termset = HashSet()
 340             query.extractTerms(termset)
 341             for t in termset:
 342                 t = Term.cast_(t)
 343                 if t.field() not in fields:
 344                     return query
 345             return BlockJoinQuery(query, self.parent_filter,
 346                                   BlockJoinQuery.ScoreMode.Total)
 347
 348     def simple_search(self, query, max_results=50):
 349         """Returns (books, total_hits)
 350         """
 351
 352         tops = self.searcher.search(self.query(query), max_results)
 353         bks = []
 354         for found in tops.scoreDocs:
 355             doc = self.searcher.doc(found.doc)
 356             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 357         return (bks, tops.totalHits)
 358
 359     def search(self, query, max_results=50):
 360         query = self.query(query)
 361         query = self.wrapjoins(query, ["content", "themes"])
 362
 363         tops = self.searcher.search(query, max_results)
 364         bks = []
 365         for found in tops.scoreDocs:
 366             doc = self.searcher.doc(found.doc)
 367             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 368         return (bks, tops.totalHits)
 369
 370     def bsearch(self, query, max_results=50):
 371         q = self.query(query)
 372         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
 373
 374         tops = self.searcher.search(bjq, max_results)
 375         bks = []
 376         for found in tops.scoreDocs:
 377             doc = self.searcher.doc(found.doc)
 378             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 379         return (bks, tops.totalHits)
 380
 381 # TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
 382 # OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
 383 # CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
 384
 385 # while (tokenStream.incrementToken()) {
 386 #     int startOffset = offsetAttribute.startOffset();
 387 #     int endOffset = offsetAttribute.endOffset();
 388 #     String term = charTermAttribute.toString();
 389 # }
 390
 391
 392 class SearchResult(object):
 393     def __init__(self, searcher, scoreDocs, score=None, highlight_query=None):
 394         if score:
 395             self.score = score
 396         else:
 397             self.score = scoreDocs.score
 398
 399         self.fragments = []
 400         self.scores = {}
 401         self.sections = []
 402
 403         stored = searcher.doc(scoreDocs.doc)
 404         self.book_id = int(stored.get("book_id"))
 405
 406         fragment = stored.get("fragment_anchor")
 407         if fragment:
 408             self.fragments.append(fragment)
 409             self.scores[fragment] = scoreDocs.score
 410
 411         header_type = stored.get("header_type")
 412         if header_type:
 413             sec = (header_type, int(stored.get("header_index")))
 414             self.sections.append(sec)
 415             self.scores[sec] = scoreDocs.score
 416
 417         self.snippets = []
 418
 419     def add_snippets(self, snippets):
 420         self.snippets += snippets
 421         return self
 422
 423     def get_book(self):
 424         return catalogue.models.Book.objects.get(id=self.book_id)
 425
 426     book = property(get_book)
 427
 428     def get_parts(self):
 429         book = self.book
 430         parts = [{"header": s[0], "position": s[1], '_score_key': s} for s in self.sections] \
 431             + [{"fragment": book.fragments.get(anchor=f), '_score_key':f} for f in self.fragments]
 432
 433         parts.sort(lambda a, b: cmp(self.scores[a['_score_key']], self.scores[b['_score_key']]))
 434         print("bookid: %d parts: %s" % (self.book_id, parts))
 435         return parts
 436
 437     parts = property(get_parts)
 438
 439     def merge(self, other):
 440         if self.book_id != other.book_id:
 441             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 442         self.fragments += other.fragments
 443         self.sections += other.sections
 444         self.snippets += other.snippets
 445         self.scores.update(other.scores)
 446         if other.score > self.score:
 447             self.score = other.score
 448         return self
 449
 450     def __unicode__(self):
 451         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
 452
 453     @staticmethod
 454     def aggregate(*result_lists):
 455         books = {}
 456         for rl in result_lists:
 457             for r in rl:
 458                 if r.book_id in books:
 459                     books[r.book_id].merge(r)
 460                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
 461                 else:
 462                     books[r.book_id] = r
 463         return books.values()
 464
 465     def __cmp__(self, other):
 466         return cmp(self.score, other.score)
 467
 468
 469 class MultiSearch(Search):
 470     """Class capable of IMDb-like searching"""
 471     def get_tokens(self, searched, field='content'):
 472         """returns tokens analyzed by a proper (for a field) analyzer
 473         argument can be: StringReader, string/unicode, or tokens. In the last case
 474         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
 475         """
 476         if isinstance(searched, str) or isinstance(searched, unicode):
 477             searched = StringReader(searched)
 478         elif isinstance(searched, list):
 479             return searched
 480
 481         searched.reset()
 482         tokens = self.analyzer.reusableTokenStream(field, searched)
 483         toks = []
 484         while tokens.incrementToken():
 485             cta = tokens.getAttribute(CharTermAttribute.class_)
 486             toks.append(cta.toString())
 487         return toks
 488
 489     def fuzziness(self, fuzzy):
 490         if not fuzzy:
 491             return None
 492         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
 493             return fuzzy
 494         else:
 495             return 0.5
 496
 497     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
 498         if fuzzy:
 499             phrase = MultiPhraseQuery()
 500             for t in tokens:
 501                 term = Term(field, t)
 502                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
 503                 fuzzterms = []
 504
 505                 while True:
 506                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
 507                     ft = fuzzterm.term()
 508                     if ft:
 509                         fuzzterms.append(ft)
 510                     if not fuzzterm.next(): break
 511                 if fuzzterms:
 512                     phrase.add(JArray('object')(fuzzterms, Term))
 513                 else:
 514                     phrase.add(term)
 515         else:
 516             phrase = PhraseQuery()
 517             phrase.setSlop(slop)
 518             for t in tokens:
 519                 term = Term(field, t)
 520                 phrase.add(term)
 521         return phrase
 522
 523     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
 524         q = BooleanQuery()
 525         for t in tokens:
 526             term = Term(field, t)
 527             if fuzzy:
 528                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
 529             else:
 530                 term = TermQuery(term)
 531             q.add(BooleanClause(term, modal))
 532         return q
 533
 534     def content_query(self, query):
 535         return BlockJoinQuery(query, self.parent_filter,
 536                               BlockJoinQuery.ScoreMode.Total)
 537
 538     def search_perfect_book(self, searched, max_results=20, fuzzy=False):
 539         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in ['author', 'title']]
 540
 541         books = []
 542         for q in qrys:
 543             top = self.searcher.search(q, max_results)
 544             for found in top.scoreDocs:
 545                 books.append(SearchResult(self.searcher, found))
 546         return books
 547
 548     def search_perfect_parts(self, searched, max_results=20, fuzzy=False):
 549         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
 550
 551         books = []
 552         for q in qrys:
 553             top = self.searcher.search(q, max_results)
 554             for found in top.scoreDocs:
 555                 books.append(SearchResult(self.searcher, found).add_snippets(self.get_snippets(found, q)))
 556
 557         return books
 558
 559     def search_everywhere(self, searched, max_results=20, fuzzy=False):
 560         books = []
 561
 562         # content only query : themes x content
 563         q = BooleanQuery()
 564
 565         tokens = self.get_tokens(searched)
 566         q.add(BooleanClause(self.make_term_query(tokens, field='themes', fuzzy=fuzzy), BooleanClause.Occur.MUST))
 567         q.add(BooleanClause(self.make_term_query(tokens, field='content', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 568
 569         topDocs = self.searcher.search(q, max_results)
 570         for found in topDocs.scoreDocs:
 571             books.append(SearchResult(self.searcher, found))
 572
 573         # joined query themes/content x author/title/epochs/genres/kinds
 574         q = BooleanQuery()
 575         in_meta = BooleanQuery()
 576         in_content = BooleanQuery()
 577
 578         for fld in ['themes', 'content']:
 579             in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
 580
 581         in_meta.add(BooleanClause(self.make_term_query(self.get_tokens(searched, field='author'), field='author', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 582
 583         for fld in ['title', 'epochs', 'genres', 'kinds']:
 584             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 585
 586         q.add(BooleanClause(in_meta, BooleanClause.Occur.MUST))
 587         in_content_join = self.content_query(in_content)
 588         q.add(BooleanClause(in_content_join, BooleanClause.Occur.MUST))
 589         #        import pdb; pdb.set_trace()
 590         collector = BlockJoinCollector(Sort.RELEVANCE, 100, True, True)
 591
 592         self.searcher.search(q, collector)
 593
 594         top_groups = collector.getTopGroups(in_content_join, Sort.RELEVANCE, 0, max_results, 0, True)
 595         if top_groups:
 596             for grp in top_groups.groups:
 597                 for part in grp.scoreDocs:
 598                     books.append(SearchResult(self.searcher, part, score=grp.maxScore))
 599         return books
 600
 601     def multisearch(self, query, max_results=50):
 602         """
 603         Search strategy:
 604         - (phrase) OR -> content
 605                       -> title
 606                       -> author
 607         - (keywords)  -> author
 608                       -> motyw
 609                       -> tags
 610                       -> content
 611         """
 612         # queryreader = StringReader(query)
 613         # tokens = self.get_tokens(queryreader)
 614
 615         # top_level = BooleanQuery()
 616         # Should = BooleanClause.Occur.SHOULD
 617
 618         # phrase_level = BooleanQuery()
 619         # phrase_level.setBoost(1.3)
 620
 621         # p_content = self.make_phrase(tokens, joined=True)
 622         # p_title = self.make_phrase(tokens, 'title')
 623         # p_author = self.make_phrase(tokens, 'author')
 624
 625         # phrase_level.add(BooleanClause(p_content, Should))
 626         # phrase_level.add(BooleanClause(p_title, Should))
 627         # phrase_level.add(BooleanClause(p_author, Should))
 628
 629         # kw_level = BooleanQuery()
 630
 631         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
 632         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
 633         # kw_level.add(j_themes, Should)
 634         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
 635         # j_con = self.make_term_query(tokens, joined=True)
 636         # kw_level.add(j_con, Should)
 637
 638         # top_level.add(BooleanClause(phrase_level, Should))
 639         # top_level.add(BooleanClause(kw_level, Should))
 640
 641         return None
 642
 643
 644     def do_search(self, query, max_results=50, collector=None):
 645         tops = self.searcher.search(query, max_results)
 646         #tops = self.searcher.search(p_content, max_results)
 647
 648         bks = []
 649         for found in tops.scoreDocs:
 650             doc = self.searcher.doc(found.doc)
 651             b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
 652             bks.append(b)
 653             print "%s (%d) -> %f" % (b, b.id, found.score)
 654         return (bks, tops.totalHits)
 655
 656     def get_snippets(self, scoreDoc, query, field='content'):
 657         htmlFormatter = SimpleHTMLFormatter()
 658         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
 659
 660         stored = self.searcher.doc(scoreDoc.doc)
 661         text = stored.get(field)
 662         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
 663         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
 664         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
 665         print('snips: %s' % snip)
 666
 667         return [snip]