apps/search/index.py
# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray
    # KeywordAnalyzer

# Initialize jvm
JVM = initVM(CLASSPATH)
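# Note: any other thread that calls into Lucene must first be attached to the
# JVM (see the JVM.attachCurrentThread() initializer in ReusableIndex.open).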

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
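    """
    Per-field analyzer wrapper: Polish analysis by default, with simple and
    keyword analyzers assigned to selected metadata fields below.
    """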
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        self.addAnalyzer("is_book", keyword)

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
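    """
    Provides the Lucene index directory: creates settings.SEARCH_INDEX on disk
    if needed and opens it as a SimpleFSDirectory store.
    """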
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
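    """
    Stores raw text snippets for a book in a flat file under
    settings.SEARCH_INDEX/snippets/<book_id>; the index itself only keeps
    (position, length) pairs pointing into that file.
    """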
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        print "Snip<%s>%s</s>" % (pos, txt)
        return pos

    def get(self, pos):
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        print "got from snippets %d bytes from %s:" % (len(txt), pos)
        return txt

    def close(self):
        self.file.close()


class Index(IndexStore):
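    """
    Writes to the Lucene index: tag documents, book metadata documents and
    per-section/fragment content documents.
    """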
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,\
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError, je:
            print "Error during optimize phase, check index: %s" % je

        self.index.close()
        self.index = None

    def index_tags(self):
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['author']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def create_book_doc(self, book):
        """
        Create a Lucene document connected to the book
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
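        """
        Builds a dict of Lucene Fields from the book's Dublin Core metadata,
        choosing how to index each field by its dcparser validator
        (unicode, person or date).
        """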
        fields = {}
        book_info = dcparser.parse(book.xml_file)

        print("extract metadata for book %s id=%d, thread %d" % (book.slug, book.id, current_thread().ident))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %\
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
        return fields

    def get_master(self, root):
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def add_gaps(self, fields, fieldname):
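        """
        Interleaves the given fields with single-space filler fields of the
        same name (a simple stand-in for a position increment gap between
        multiple values).
        """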
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def index_content(self, book, book_fields=[]):
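        """
        Walks the book's master element and adds one document per header
        (section) and one per themed fragment; raw text goes to the Snippets
        file, its position into the index.
        """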
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
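            """Depth-first event stream: (node, None) on entering a node, (None, node) on leaving it."""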
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("/$", "", text, flags=re.M)

        def add_part(snippets, **fields):
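            """
            Builds one Lucene document for a section or fragment: header info,
            analyzed content with term vectors, the snippet (position, length)
            and, optionally, the fragment anchor and its themes.
            """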
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)\
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue

                content = u' '.join([t for t in header.itertext()])
                content = fix_format(content)

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)

                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        fragments[fid]['themes'].append(start.text)
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: x is None and u'(none)' or unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()


    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception, e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None
    pool = None
    pool_jobs = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread())
            ReusableIndex.pool_jobs = []
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    def index_book(self, *args, **kw):
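        # Dispatch indexing to the thread pool; the job is collected here and
        # awaited in close_reusable() at exit.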
        job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
        ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            print("wait for indexing to finish")
            for job in ReusableIndex.pool_jobs:
                job.get()
                sys.stdout.write('.')
                sys.stdout.flush()
            print("done.")
            ReusableIndex.pool.close()

            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass


class Search(IndexStore):
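    """
    Read-only access to the index: parses user queries with QueryParser and
    maps Lucene hits back to catalogue.models.Book objects via the stored
    book_id field.
    """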
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        ## self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=[]):
        """
        Recursively modifies the query so that contained Term and Phrase
        queries matching the provided fields are wrapped in a BlockJoinQuery
        and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

# TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
# OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
# CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);

# while (tokenStream.incrementToken()) {
#     int startOffset = offsetAttribute.startOffset();
#     int endOffset = offsetAttribute.endOffset();
#     String term = charTermAttribute.toString();
# }


class SearchResult(object):
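    """
    Wraps a Lucene hit for a single book: keeps the book_id, a score and a
    list of (section, fragment, score, metadata) hit tuples; results for the
    same book can be merged.
    """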
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
        self.snippets = []

        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self.hits = []

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = header_span is not None and int(header_span) or 1

        fragment = stored.get("fragment_anchor")

        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets})

        self.hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.hits += other.hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def get_parts(self):
        book = self.book

        def sections_covered(results):
            frags = filter(lambda r: r[1] is not None, results)
            sect = filter(lambda r: r[1] is None, results)
            sect = filter(lambda s: 0 == len(filter(
                lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
                frags)), sect)
            print "filtered, non overlapped sections: %s" % sect
            return frags + sect

        parts = [{"header": s[0], "position": s[1], '_score_key': s} for s in self.sections] \
            + [{"fragment": book.fragments.get(anchor=f), '_score_key': f} for f in self.fragments]

        parts.sort(lambda a, b: cmp(self.scores[a['_score_key']], self.scores[b['_score_key']]))
        print("bookid: %d parts: %s" % (self.book_id, parts))
        return parts

    parts = property(get_parts)

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)


class Hint(object):
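    """
    Collects hints (selected tags, a concrete book) that narrow down further
    searches, either as Lucene filters or by skipping already-determined
    fields.
    """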
    def __init__(self, search):
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self.book = None

    def set_book(self, book):
        # renamed from book() so the setter is not shadowed by the self.book attribute
        self.book = book

    def tags(self, tags):
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))
        if self.book is not None:
            bf = TermsFilter()
            bf.addTerm # TODO

    def should_search_for_book(self):
        return self.book is None

    def just_search_in(self, all):
        """Decides which fields should still be searched, given the hints we already have."""
        some = []
        for field in all:
            if field == 'author' and 'author' in self.book_tags:
                continue
            if field == 'title' and self.book is not None:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some


class MultiSearch(Search):
    """Class capable of IMDb-like searching."""
    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the proper analyzer for the given field.
        The argument can be a StringReader, a string/unicode, or a token list. In the last case
        the tokens are just returned (so we can reuse them if the analyzer has not changed).
        """
        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
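        # Build a boolean query with one TermQuery (or FuzzyQuery, when fuzzy
        # matching is requested) per token, combined with the given modality.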
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def content_query(self, query):
        return BlockJoinQuery(query, self.parent_filter,
                              BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['author', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt
                                                          ]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content only query : themes x content
        q = BooleanQuery()

        tokens = self.get_tokens(searched)
        if hint is None or hint.just_search_in(['themes_pl']) != []:
            q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl', fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens, field='content', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        # joined query themes/content x author/title/epochs/genres/kinds
        # q = BooleanQuery()
        # in_meta = BooleanQuery()
        # in_content = BooleanQuery()

        # for fld in ['themes', 'content']:
        #     in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        # in_meta.add(BooleanClause(self.make_term_query(
        #     self.get_tokens(searched, field='author'), field='author', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        # for fld in ['title', 'epochs', 'genres', 'kinds']:
        #     in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        # q.add(BooleanClause(in_meta, BooleanClause.Occur.MUST))
        # in_content_join = self.content_query(in_content)
        # q.add(BooleanClause(in_content_join, BooleanClause.Occur.MUST))
        # #        import pdb; pdb.set_trace()
        # collector = BlockJoinCollector(Sort.RELEVANCE, 100, True, True)

        # self.searcher.search(q, collector)

        # top_groups = collector.getTopGroups(in_content_join, Sort.RELEVANCE, 0, max_results, 0, True)
        # if top_groups:
        #     for grp in top_groups.groups:
        #         for part in grp.scoreDocs:
        #             books.append(SearchResult(self.searcher, part, score=grp.maxScore))
        return books

    def multisearch(self, query, max_results=50):
        """
        Search strategy:
        - (phrase) OR -> content
                      -> title
                      -> author
        - (keywords)  -> author
                      -> motyw
                      -> tags
                      -> content
        """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        return None

    def book_search(self, query, filter=None, max_results=50, collector=None):
        tops = self.searcher.search(query, filter, max_results)
        #tops = self.searcher.search(p_content, max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
            bks.append(b)
            print "%s (%d) -> %f" % (b, b.id, found.score)
        return bks

    def get_snippets(self, scoreDoc, query, field='content'):
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate content.
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        #        import pdb; pdb.set_trace()
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
        print('snips: %s' % snip)

        return [snip]

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a Lucene TermEnum to an array of Terms, suitable for
        adding to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print "%s (%d) -> %f" % (tag, tag.id, found.score)

        return tags

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = MultiSearch.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
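        """
        Builds a TermsFilter matching a single term; with inverse=True the
        filter is wrapped in a BooleanFilter with a MUST_NOT clause.
        """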
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.book_search(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)