apps/search/index.py

   1 # -*- coding: utf-8 -*-
   2
   3 from django.conf import settings
   4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
   5     File, Field, \
   6     NumericField, Version, Document, JavaError, IndexSearcher, \
   7     QueryParser, PerFieldAnalyzerWrapper, \
   8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
   9     KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
  10     BlockJoinQuery, BlockJoinCollector, TermsFilter, \
  11     HashSet, BooleanClause, Term, CharTermAttribute, \
  12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, BlockJoinQuery, \
  13     FuzzyQuery, FuzzyTermEnum, Sort, Integer, \
  14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
  15     initVM, CLASSPATH, JArray, JavaError
  16     # KeywordAnalyzer
  17
# Initialize the JVM when this module is imported; the maximum heap size
# (400 * 1024 * 1024, i.e. 400 MiB in bytes) is handed to initVM as a string.
JVM = initVM(classpath=CLASSPATH, maxheap=str(400*1024*1024))
  20
  21 import sys
  22 import os
  23 import re
  24 import errno
  25 from librarian import dcparser
  26 from librarian.parser import WLDocument
  27 import catalogue.models
  28 from multiprocessing.pool import ThreadPool
  29 from threading import current_thread
  30 import atexit
  31 import traceback
  32
  33
  34 class WLAnalyzer(PerFieldAnalyzerWrapper):
  35     def __init__(self):
  36         polish = PolishAnalyzer(Version.LUCENE_34)
  37         simple = SimpleAnalyzer(Version.LUCENE_34)
  38         keyword = KeywordAnalyzer(Version.LUCENE_34)
  39         # not sure if needed: there's NOT_ANALYZED meaning basically the same
  40
  41         PerFieldAnalyzerWrapper.__init__(self, polish)
  42
  43         self.addAnalyzer("tags", simple)
  44         self.addAnalyzer("technical_editors", simple)
  45         self.addAnalyzer("editors", simple)
  46         self.addAnalyzer("url", keyword)
  47         self.addAnalyzer("source_url", keyword)
  48         self.addAnalyzer("source_name", simple)
  49         self.addAnalyzer("publisher", simple)
  50         self.addAnalyzer("author", simple)
  51         self.addAnalyzer("is_book", keyword)
  52
  53         self.addAnalyzer("KEYWORD", keyword)
  54         self.addAnalyzer("SIMPLE", simple)
  55         self.addAnalyzer("POLISH", polish)
  56
  57
  58 class IndexStore(object):
  59     def __init__(self):
  60         self.make_index_dir()
  61         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
  62
  63     def make_index_dir(self):
  64         try:
  65             os.makedirs(settings.SEARCH_INDEX)
  66         except OSError as exc:
  67             if exc.errno == errno.EEXIST:
  68                 pass
  69             else: raise
  70
  71
  72 class IndexChecker(IndexStore):
  73     def __init__(self):
  74         IndexStore.__init__(self)
  75
  76     def check(self):
  77         checker = CheckIndex(self.store)
  78         status = checker.checkIndex()
  79         return status
  80
  81
  82 class Snippets(object):
  83     SNIPPET_DIR = "snippets"
  84
  85     def __init__(self, book_id):
  86         try:
  87             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  88         except OSError as exc:
  89             if exc.errno == errno.EEXIST:
  90                 pass
  91             else: raise
  92         self.book_id = book_id
  93         self.file = None
  94
  95     def open(self, mode='r'):
  96         if not 'b' in mode:
  97             mode += 'b'
  98         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
  99         self.position = 0
 100         return self
 101
 102     def add(self, snippet):
 103         l = len(snippet)
 104         self.file.write(snippet.encode('utf-8'))
 105         pos = (self.position, l)
 106         self.position += l
 107         return pos
 108
 109     def get(self, pos):
 110         self.file.seek(pos[0], 0)
 111         return self.read(pos[1]).decode('utf-8')
 112
 113     def close(self):
 114         self.file.close()
 115
 116
 117 class Index(IndexStore):
 118     def __init__(self, analyzer=None):
 119         IndexStore.__init__(self)
 120         self.index = None
 121         if not analyzer:
 122             analyzer = WLAnalyzer()
 123         self.analyzer = analyzer
 124
 125     def open(self, analyzer=None):
 126         if self.index:
 127             raise Exception("Index is already opened")
 128         self.index = IndexWriter(self.store, self.analyzer,\
 129                                  IndexWriter.MaxFieldLength.LIMITED)
 130         return self.index
 131
 132     def optimize(self):
 133         self.index.optimize()
 134
 135     def close(self):
 136         try:
 137             self.index.optimize()
 138         except JavaError, je:
 139             print "Error during optimize phase, check index: %s" % je
 140
 141         self.index.close()
 142         self.index = None
 143
 144     def remove_book(self, book):
 145         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
 146         self.index.deleteDocuments(q)
 147
 148     def index_book(self, book, overwrite=True):
 149         if overwrite:
 150             self.remove_book(book)
 151
 152         doc = self.extract_metadata(book)
 153         parts = self.extract_content(book)
 154         block = ArrayList().of_(Document)
 155
 156         print "adding block."
 157         for p in parts:
 158             block.add(p)
 159         block.add(doc)
 160         self.index.addDocuments(block)
 161         print "added."
 162
 163     master_tags = [
 164         'opowiadanie',
 165         'powiesc',
 166         'dramat_wierszowany_l',
 167         'dramat_wierszowany_lp',
 168         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 169         'wywiad'
 170         ]
 171
 172     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
 173
 174     def create_book_doc(self, book):
 175         """
 176         Create a lucene document connected to the book
 177         """
 178         doc = Document()
 179         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
 180         if book.parent is not None:
 181             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
 182         return doc
 183
 184     def extract_metadata(self, book):
 185         book_info = dcparser.parse(book.xml_file)
 186
 187         print("extract metadata for book %s id=%d, thread%d" % (book.slug, book.id, current_thread().ident))
 188
 189         doc = self.create_book_doc(book)
 190         doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
 191         doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
 192         doc.add(Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
 193
 194         # validator, name
 195         for field in dcparser.BookInfo.FIELDS:
 196             if hasattr(book_info, field.name):
 197                 if not getattr(book_info, field.name):
 198                     continue
 199                 # since no type information is available, we use validator
 200                 type_indicator = field.validator
 201                 if type_indicator == dcparser.as_unicode:
 202                     s = getattr(book_info, field.name)
 203                     if field.multiple:
 204                         s = ', '.join(s)
 205                     try:
 206                         doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED))
 207                     except JavaError as je:
 208                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
 209                 elif type_indicator == dcparser.as_person:
 210                     p = getattr(book_info, field.name)
 211                     if isinstance(p, dcparser.Person):
 212                         persons = unicode(p)
 213                     else:
 214                         persons = ', '.join(map(unicode, p))
 215                     doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED))
 216                 elif type_indicator == dcparser.as_date:
 217                     dt = getattr(book_info, field.name)
 218                     doc.add(Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED))
 219         return doc
 220
 221     def get_master(self, root):
 222         for master in root.iter():
 223             if master.tag in self.master_tags:
 224                 return master
 225
 226     def extract_content(self, book):
 227         wld = WLDocument.from_file(book.xml_file.path)
 228         root = wld.edoc.getroot()
 229
 230         master = self.get_master(root)
 231         if master is None:
 232             return []
 233
 234         def walker(node):
 235             yield node, None
 236             for child in list(node):
 237                 for b, e in walker(child):
 238                     yield b, e
 239             yield None, node
 240             return
 241
 242         def fix_format(text):
 243             return re.sub("/$", "", text, flags=re.M)
 244
 245         # header_type
 246         # header_index
 247         header_docs = []
 248         # Then we create a document for each fragments
 249         # fragment_anchor - the anchor
 250         # themes - list of themes [not indexed]
 251         fragment_docs = []
 252         # will contain (framgent id -> { content: [], themes: [] }
 253         fragments = {}
 254         snippets = Snippets(book.id).open('w')
 255         try:
 256             for header, position in zip(list(master), range(len(master))):
 257                 sys.stdout.write("\rsection: %d" % position)
 258
 259                 if header.tag in self.skip_header_tags:
 260                     continue
 261
 262                 doc = self.create_book_doc(book)
 263
 264                 doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
 265                 doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
 266
 267                 content = u' '.join([t for t in header.itertext()])
 268                 content = fix_format(content)
 269
 270                 doc.add(Field("content", content, Field.Store.NO, Field.Index.ANALYZED))
 271                 snip_pos = snippets.add(content)
 272                 doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
 273                 doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[0]))
 274
 275                 header_docs.append(doc)
 276
 277                 for start, end in walker(master):
 278                     if start is not None and start.tag == 'begin':
 279                         fid = start.attrib['id'][1:]
 280                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 281                         fragments[fid]['content'].append(start.tail)
 282                     elif start is not None and start.tag == 'motyw':
 283                         fid = start.attrib['id'][1:]
 284                         fragments[fid]['themes'].append(start.text)
 285                         fragments[fid]['content'].append(start.tail)
 286                     elif start is not None and start.tag == 'end':
 287                         fid = start.attrib['id'][1:]
 288                         if fid not in fragments:
 289                             continue  # a broken <end> node, skip it
 290                         frag = fragments[fid]
 291                         del fragments[fid]
 292
 293                         def jstr(l):
 294                             return u' '.join(map(
 295                                 lambda x: x == None and u'(none)' or unicode(x),
 296                                 l))
 297
 298                         doc = self.create_book_doc(book)
 299
 300                         doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
 301                         doc.add(NumericField("header_span", Field.Store.YES, True).setIntValue(position - frag['start_section'] + 1))
 302                         doc.add(Field("header_type", frag['start_header'], Field.Store.YES, Field.Index.NOT_ANALYZED))
 303
 304                         doc.add(Field("fragment_anchor", fid,
 305                                       Field.Store.YES, Field.Index.NOT_ANALYZED))
 306                         doc.add(Field("content",
 307                                       u' '.join(filter(lambda s: s is not None, frag['content'])),
 308                                       Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
 309
 310                         snip_pos = snippets.add(content)
 311                         doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
 312                         doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[0]))
 313
 314                         doc.add(Field("themes",
 315                                       u' '.join(filter(lambda s: s is not None, frag['themes'])),
 316                                       Field.Store.NO, Field.Index.ANALYZED))
 317
 318                         fragment_docs.append(doc)
 319                     elif start is not None:
 320                         for frag in fragments.values():
 321                             frag['content'].append(start.text)
 322                     elif end is not None:
 323                         for frag in fragments.values():
 324                             frag['content'].append(end.tail)
 325         finally:
 326             snippets.close()
 327
 328         return header_docs + fragment_docs
 329
 330     def __enter__(self):
 331         self.open()
 332         return self
 333
 334     def __exit__(self, type, value, tb):
 335         self.close()
 336
 337
 338 def log_exception_wrapper(f):
 339     def _wrap(*a):
 340         try:
 341             f(*a)
 342         except Exception, e:
 343             print("Error in indexing thread: %s" % e)
 344             traceback.print_exc()
 345             raise e
 346     return _wrap
 347
 348
 349 class ReusableIndex(Index):
 350     """
 351     Works like index, but does not close/optimize Lucene index
 352     until program exit (uses atexit hook).
 353     This is usefull for importbooks command.
 354
 355     if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
 356     """
 357     index = None
 358     pool = None
 359     pool_jobs = None
 360
 361     def open(self, analyzer=None, threads=4):
 362         if ReusableIndex.index is not None:
 363             self.index = ReusableIndex.index
 364         else:
 365             print("opening index")
 366             ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread() )
 367             ReusableIndex.pool_jobs = []
 368             Index.open(self, analyzer)
 369             ReusableIndex.index = self.index
 370             atexit.register(ReusableIndex.close_reusable)
 371
 372     def index_book(self, *args, **kw):
 373         job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
 374         ReusableIndex.pool_jobs.append(job)
 375
 376     @staticmethod
 377     def close_reusable():
 378         if ReusableIndex.index is not None:
 379             print("wait for indexing to finish")
 380             for job in ReusableIndex.pool_jobs:
 381                 job.get()
 382                 sys.stdout.write('.')
 383                 sys.stdout.flush()
 384             print("done.")
 385             ReusableIndex.pool.close()
 386
 387             ReusableIndex.index.optimize()
 388             ReusableIndex.index.close()
 389             ReusableIndex.index = None
 390
 391     def close(self):
 392         pass
 393
 394
 395 class Search(IndexStore):
 396     def __init__(self, default_field="content"):
 397         IndexStore.__init__(self)
 398         self.analyzer = WLAnalyzer() #PolishAnalyzer(Version.LUCENE_34)
 399         ## self.analyzer = WLAnalyzer()
 400         self.searcher = IndexSearcher(self.store, True)
 401         self.parser = QueryParser(Version.LUCENE_34, default_field,
 402                                   self.analyzer)
 403
 404         self.parent_filter = TermsFilter()
 405         self.parent_filter.addTerm(Term("is_book", "true"))
 406
 407     def query(self, query):
 408         return self.parser.parse(query)
 409
 410     def wrapjoins(self, query, fields=[]):
 411         """
 412         This functions modifies the query in a recursive way,
 413         so Term and Phrase Queries contained, which match
 414         provided fields are wrapped in a BlockJoinQuery,
 415         and so delegated to children documents.
 416         """
 417         if BooleanQuery.instance_(query):
 418             qs = BooleanQuery.cast_(query)
 419             for clause in qs:
 420                 clause = BooleanClause.cast_(clause)
 421                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
 422             return qs
 423         else:
 424             termset = HashSet()
 425             query.extractTerms(termset)
 426             for t in termset:
 427                 t = Term.cast_(t)
 428                 if t.field() not in fields:
 429                     return query
 430             return BlockJoinQuery(query, self.parent_filter,
 431                                   BlockJoinQuery.ScoreMode.Total)
 432
 433     def simple_search(self, query, max_results=50):
 434         """Returns (books, total_hits)
 435         """
 436
 437         tops = self.searcher.search(self.query(query), max_results)
 438         bks = []
 439         for found in tops.scoreDocs:
 440             doc = self.searcher.doc(found.doc)
 441             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 442         return (bks, tops.totalHits)
 443
 444     def search(self, query, max_results=50):
 445         query = self.query(query)
 446         query = self.wrapjoins(query, ["content", "themes"])
 447
 448         tops = self.searcher.search(query, max_results)
 449         bks = []
 450         for found in tops.scoreDocs:
 451             doc = self.searcher.doc(found.doc)
 452             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 453         return (bks, tops.totalHits)
 454
 455     def bsearch(self, query, max_results=50):
 456         q = self.query(query)
 457         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
 458
 459         tops = self.searcher.search(bjq, max_results)
 460         bks = []
 461         for found in tops.scoreDocs:
 462             doc = self.searcher.doc(found.doc)
 463             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 464         return (bks, tops.totalHits)
 465
 466 # TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
 467 # OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
 468 # CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
 469
 470 # while (tokenStream.incrementToken()) {
 471 #     int startOffset = offsetAttribute.startOffset();
 472 #     int endOffset = offsetAttribute.endOffset();
 473 #     String term = charTermAttribute.toString();
 474 # }
 475
 476
 477 class SearchResult(object):
 478     def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets_cb=None):
 479         if score:
 480             self.score = score
 481         else:
 482             self.score = scoreDocs.score
 483
 484         self.hits = []
 485
 486         stored = searcher.doc(scoreDocs.doc)
 487         self.book_id = int(stored.get("book_id"))
 488
 489         header_type = stored.get("header_type")
 490         sec = (header_type, int(stored.get("header_index")))
 491         header_span = stored.get('header_span')
 492         header_span = header_span is not None and int(header_span) or 1
 493         stored = searcher.doc(scoreDocs.doc)
 494         self.book_id = int(stored.get("book_id"))
 495
 496         fragment = stored.get("fragment_anchor")
 497
 498         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets_cb': snippets_cb})
 499
 500         self.hits.append(hit)
 501
 502     def merge(self, other):
 503         if self.book_id != other.book_id:
 504             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 505         self.hits += other.hits
 506         if other.score > self.score:
 507             self.score = other.score
 508         return self
 509
 510     def add_snippets(self, snippets):
 511         self.snippets += snippets
 512         return self
 513
 514     def get_book(self):
 515         return catalogue.models.Book.objects.get(id=self.book_id)
 516
 517     book = property(get_book)
 518
 519     def get_parts(self):
 520         book = self.book
 521
 522         def sections_covered(results):
 523             frags = filter(lambda r: r[1] is not None, results)
 524             sect = filter(lambda r: r[1] is None, results)
 525             sect = filter(lambda s: 0 == len(filter(
 526                 lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
 527                 frags)), sect)
 528             print "filtered, non overlapped sections: %s" % sect
 529             return frags + sect
 530
 531
 532         parts = [{"header": s[0], "position": s[1], '_score_key': s} for s in self.sections] \
 533             + [{"fragment": book.fragments.get(anchor=f), '_score_key':f} for f in self.fragments]
 534
 535         parts.sort(lambda a, b: cmp(self.scores[a['_score_key']], self.scores[b['_score_key']]))
 536         print("bookid: %d parts: %s" % (self.book_id, parts))
 537         return parts
 538
 539     parts = property(get_parts)
 540
 541
 542     def __unicode__(self):
 543         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
 544
 545     @staticmethod
 546     def aggregate(*result_lists):
 547         books = {}
 548         for rl in result_lists:
 549             for r in rl:
 550                 if r.book_id in books:
 551                     books[r.book_id].merge(r)
 552                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
 553                 else:
 554                     books[r.book_id] = r
 555         return books.values()
 556
 557     def __cmp__(self, other):
 558         return cmp(self.score, other.score)
 559
 560
 561 class MultiSearch(Search):
 562     """Class capable of IMDb-like searching"""
 563     def get_tokens(self, searched, field='content'):
 564         """returns tokens analyzed by a proper (for a field) analyzer
 565         argument can be: StringReader, string/unicode, or tokens. In the last case
 566         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
 567         """
 568         if isinstance(searched, str) or isinstance(searched, unicode):
 569             searched = StringReader(searched)
 570         elif isinstance(searched, list):
 571             return searched
 572
 573         searched.reset()
 574         tokens = self.analyzer.reusableTokenStream(field, searched)
 575         toks = []
 576         while tokens.incrementToken():
 577             cta = tokens.getAttribute(CharTermAttribute.class_)
 578             toks.append(cta.toString())
 579         return toks
 580
 581     def fuzziness(self, fuzzy):
 582         if not fuzzy:
 583             return None
 584         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
 585             return fuzzy
 586         else:
 587             return 0.5
 588
 589     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
 590         if fuzzy:
 591             phrase = MultiPhraseQuery()
 592             for t in tokens:
 593                 term = Term(field, t)
 594                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
 595                 fuzzterms = []
 596
 597                 while True:
 598                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
 599                     ft = fuzzterm.term()
 600                     if ft:
 601                         fuzzterms.append(ft)
 602                     if not fuzzterm.next(): break
 603                 if fuzzterms:
 604                     phrase.add(JArray('object')(fuzzterms, Term))
 605                 else:
 606                     phrase.add(term)
 607         else:
 608             phrase = PhraseQuery()
 609             phrase.setSlop(slop)
 610             for t in tokens:
 611                 term = Term(field, t)
 612                 phrase.add(term)
 613         return phrase
 614
 615     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
 616         q = BooleanQuery()
 617         for t in tokens:
 618             term = Term(field, t)
 619             if fuzzy:
 620                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
 621             else:
 622                 term = TermQuery(term)
 623             q.add(BooleanClause(term, modal))
 624         return q
 625
 626     def content_query(self, query):
 627         return BlockJoinQuery(query, self.parent_filter,
 628                               BlockJoinQuery.ScoreMode.Total)
 629
 630     def search_perfect_book(self, searched, max_results=20, fuzzy=False):
 631         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in ['author', 'title']]
 632
 633         books = []
 634         for q in qrys:
 635             top = self.searcher.search(q, max_results)
 636             for found in top.scoreDocs:
 637                 books.append(SearchResult(self.searcher, found))
 638         return books
 639
 640     def search_perfect_parts(self, searched, max_results=20, fuzzy=False):
 641         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
 642
 643         books = []
 644         for q in qrys:
 645             top = self.searcher.search(q, max_results)
 646             for found in top.scoreDocs:
 647                 books.append(SearchResult(self.searcher, found).add_snippets(self.get_snippets(found, q)))
 648
 649         return books
 650
 651     def search_everywhere(self, searched, max_results=20, fuzzy=False):
 652         books = []
 653
 654         # content only query : themes x content
 655         q = BooleanQuery()
 656
 657         tokens = self.get_tokens(searched)
 658         q.add(BooleanClause(self.make_term_query(tokens, field='themes', fuzzy=fuzzy), BooleanClause.Occur.MUST))
 659         q.add(BooleanClause(self.make_term_query(tokens, field='content', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 660
 661         topDocs = self.searcher.search(q, max_results)
 662         for found in topDocs.scoreDocs:
 663             books.append(SearchResult(self.searcher, found))
 664
 665         # joined query themes/content x author/title/epochs/genres/kinds
 666         q = BooleanQuery()
 667         in_meta = BooleanQuery()
 668         in_content = BooleanQuery()
 669
 670         for fld in ['themes', 'content']:
 671             in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
 672
 673         in_meta.add(BooleanClause(self.make_term_query(
 674             self.get_tokens(searched, field='author'), field='author', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 675
 676         for fld in ['title', 'epochs', 'genres', 'kinds']:
 677             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 678
 679         q.add(BooleanClause(in_meta, BooleanClause.Occur.MUST))
 680         in_content_join = self.content_query(in_content)
 681         q.add(BooleanClause(in_content_join, BooleanClause.Occur.MUST))
 682         #        import pdb; pdb.set_trace()
 683         collector = BlockJoinCollector(Sort.RELEVANCE, 100, True, True)
 684
 685         self.searcher.search(q, collector)
 686
 687         top_groups = collector.getTopGroups(in_content_join, Sort.RELEVANCE, 0, max_results, 0, True)
 688         if top_groups:
 689             for grp in top_groups.groups:
 690                 for part in grp.scoreDocs:
 691                     books.append(SearchResult(self.searcher, part, score=grp.maxScore))
 692         return books
 693
 694     def multisearch(self, query, max_results=50):
 695         """
 696         Search strategy:
 697         - (phrase) OR -> content
 698                       -> title
 699                       -> author
 700         - (keywords)  -> author
 701                       -> motyw
 702                       -> tags
 703                       -> content
 704         """
 705         # queryreader = StringReader(query)
 706         # tokens = self.get_tokens(queryreader)
 707
 708         # top_level = BooleanQuery()
 709         # Should = BooleanClause.Occur.SHOULD
 710
 711         # phrase_level = BooleanQuery()
 712         # phrase_level.setBoost(1.3)
 713
 714         # p_content = self.make_phrase(tokens, joined=True)
 715         # p_title = self.make_phrase(tokens, 'title')
 716         # p_author = self.make_phrase(tokens, 'author')
 717
 718         # phrase_level.add(BooleanClause(p_content, Should))
 719         # phrase_level.add(BooleanClause(p_title, Should))
 720         # phrase_level.add(BooleanClause(p_author, Should))
 721
 722         # kw_level = BooleanQuery()
 723
 724         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
 725         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
 726         # kw_level.add(j_themes, Should)
 727         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
 728         # j_con = self.make_term_query(tokens, joined=True)
 729         # kw_level.add(j_con, Should)
 730
 731         # top_level.add(BooleanClause(phrase_level, Should))
 732         # top_level.add(BooleanClause(kw_level, Should))
 733
 734         return None
 735
 736     def do_search(self, query, max_results=50, collector=None):
 737         tops = self.searcher.search(query, max_results)
 738         #tops = self.searcher.search(p_content, max_results)
 739
 740         bks = []
 741         for found in tops.scoreDocs:
 742             doc = self.searcher.doc(found.doc)
 743             b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
 744             bks.append(b)
 745             print "%s (%d) -> %f" % (b, b.id, found.score)
 746         return (bks, tops.totalHits)
 747
 748     def get_snippets(self, scoreDoc, query, field='content'):
 749         htmlFormatter = SimpleHTMLFormatter()
 750         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
 751
 752         stored = self.searcher.doc(scoreDoc.doc)
 753
 754         # locate content.
 755         snippets = Snippets(stored.get('book_id')).open()
 756         try:
 757             text = snippets.get(stored.get('snippets_position'), stored.get('snippets_length'))
 758         finally:
 759             snippets.close()
 760
 761         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
 762         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
 763         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
 764         print('snips: %s' % snip)
 765
 766         return [snip]