# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray
    # KeywordAnalyzer

# Initialize the JVM
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)

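# Editor's note: an illustrative sketch, not part of the original module. It only
# demonstrates the per-field routing set up above: "authors" goes through
# SimpleAnalyzer, "url" stays a single keyword token, and unknown fields fall
# back to the default PolishAnalyzer.
def _example_wlanalyzer_tokens():
    analyzer = WLAnalyzer()
    stream = analyzer.reusableTokenStream("authors", StringReader(u"Adam Mickiewicz"))
    toks = []
    while stream.incrementToken():
        toks.append(stream.getAttribute(CharTermAttribute.class_).toString())
    return toks  # e.g. [u'adam', u'mickiewicz']
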
class IndexStore(object):
    """
    Provides access to the search index.

    self.store - Lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status

class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode snippet
        stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

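# Editor's note: an illustrative sketch (not in the original source) of the
# intended Snippets round trip; the book id 1 is just an example value and a
# configured settings.SEARCH_INDEX is assumed.
def _example_snippets_roundtrip():
    snippets = Snippets(1).open('w')
    try:
        pos = snippets.add(u"Litwo! Ojczyzno moja!")  # returns (position, length)
    finally:
        snippets.close()
    snippets = Snippets(1).open()                     # read mode
    try:
        return snippets.get(pos)                      # -> the same unicode text
    finally:
        snippets.close()
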

class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on an index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already open")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()

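# Editor's note: a minimal sketch (not in the original source) of the context
# manager protocol defined above: the index is opened on __enter__ and
# optimized/closed on __exit__, even if indexing raises.
def _example_reindex_tags():
    with Index() as index:  # Index is defined just below
        index.index_tags()
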
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interleaves a list of fields with gap fields (indexed single spaces) and returns the result.
        This allows phrase queries that do not match across the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
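    # Editor's note (not in the original source): for fields [t1, t2, t3] in,
    # say, the 'tags' field, add_gaps returns [t1, gap, t2, gap, t3], where each
    # gap is a single-space NOT_ANALYZED field; with slop 0 a phrase query can
    # match within one tag value but not across two neighbouring ones.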

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span', 1)))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []

                for start, end in walker(header):
                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: x is None and u'(none)' or unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)

                    # Collect content.
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                        content.append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
                        content.append(end.tail)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(u' '.join(filter(lambda s: s is not None, content))))

                self.index.addDocument(doc)

        finally:
            snippets.close()

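# Editor's note: an illustrative sketch (not in the original source) of how the
# Index class above is meant to be driven; error handling is minimal on purpose.
def _example_reindex_everything():
    index = Index()
    index.open()
    try:
        index.index_tags()
        for book in catalogue.models.Book.objects.all():
            index.index_book(book)
    finally:
        index.close()  # also optimizes the index
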
def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise
    return _wrap

class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass

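# Editor's note: a minimal sketch (not in the original source). Because
# ReusableIndex.close() is a no-op, several importers can share one writer; the
# atexit hook (or an explicit close_reusable()) optimizes and closes it once.
def _example_reusable_indexing(books):
    for book in books:
        index = ReusableIndex()
        index.open()
        index.index_book(book)
        index.close()               # no-op; the shared writer stays open
    ReusableIndex.close_reusable()  # optional if atexit can be relied upon
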
class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query recursively, so that any contained
        Term and Phrase queries which match the provided fields are wrapped
        in a BlockJoinQuery and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self._hits = []
        self.hits = None  # processed hits

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = header_span is not None and int(header_span) or 1

        fragment = stored.get("fragment_anchor")

        if snippets:
            snippets = snippets.replace("/\n", "\n")
        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

        self._hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with book %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        hits = []

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self.hits = hits

        return self

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)

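# Editor's note: an illustrative sketch (not in the original source) showing how
# results from several queries are meant to be combined: aggregate() merges hits
# per book, process_hits() de-duplicates sections/fragments and sorts by score.
def _example_aggregate_results(search, query):
    perfect = search.search_perfect_book(query)
    parts = search.search_perfect_parts(query)
    everywhere = search.search_everywhere(query)
    results = SearchResult.aggregate(perfect, parts, everywhere)
    for result in results:
        result.process_hits()
    return sorted(results, reverse=True)  # uses SearchResult.__cmp__
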
class Hint(object):
    """
    Given some hint information (things we already know) about our search
    target - like author, title (a specific book), epoch, genre, kind -
    we can narrow the search down using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we are searching for these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (given as a list)
        are necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the 'tags' field),
        returns a filter accepting only books with the specified tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag categories except theme).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some

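# Editor's note: a minimal sketch (not in the original source) of the intended
# Hint workflow; author_tag stands for any Tag with category 'author'.
def _example_hint_usage(search, author_tag, query):
    hint = search.hint()
    hint.tags([author_tag])
    # With an author hint, 'authors' is dropped from the fields to search...
    fields = hint.just_search_in(['authors', 'title'])  # -> ['title']
    # ...and book_filter() restricts matches to books carrying that tag.
    return search.search_perfect_book(query, hint=hint)
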
class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parses a query in the default Lucene syntax (for humans).
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a book query using Lucene syntax (for humans).
        Returns (books, total_hits).
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode, or a token list; in the
        last case the tokens are just returned (so we can reuse them as long as the
        analyzer does not change).
        """
        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness value."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Returns a PhraseQuery built from a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined into a boolean query.
        modal - the occurrence requirement applied to each clause.
        fuzzy - whether the term queries should be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    # def content_query(self, query):
    #     return BlockJoinQuery(query, self.parent_filter,
    #                           BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Searches for perfect book matches. Simply checks whether the query matches
        some author or title, taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Searches for book parts which contain a phrase perfectly matching
        (with a slop of 2, the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Tries to use the search terms to match different fields of a book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the
        rest some words from the third chapter.
        """
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content')
        tokens = self.get_tokens(searched, field='SIMPLE')

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found='search_everywhere_themesXcontent'))
            print("* %s theme x content: %s" % (searched, books[-1]._hits))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found='search_everywhere'))
            print("* %s scatter search: %s" % (searched, books[-1]._hits))

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        # return None

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate content.
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a Lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Searches for Tag objects using the query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print("%s (%d) -> %f" % (tag, tag.id, found.score))

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using the query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        """
        Returns auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Returns auto-complete hints for book titles
        (we do not index 'pseudo' title tags).
        Uses prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a list of filters together.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        """
        Returns a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)
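

# Editor's note: a small sketch (not in the original source) of the
# auto-complete helpers above; the prefixes are just example inputs.
def _example_autocomplete():
    search = Search()
    tags = search.hint_tags(u"mick")     # Tag objects whose name starts with "mick"
    books = search.hint_books(u"pan t")  # Books whose title starts with "pan t"
    return tags, books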