apps/search/index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 import catalogue.models
29 from multiprocessing.pool import ThreadPool
30 from threading import current_thread
31 import atexit
32 import traceback
33
34
35 class WLAnalyzer(PerFieldAnalyzerWrapper):
36     def __init__(self):
37         polish = PolishAnalyzer(Version.LUCENE_34)
38         #        polish_gap.setPositionIncrementGap(999)
39
40         simple = SimpleAnalyzer(Version.LUCENE_34)
41         #        simple_gap.setPositionIncrementGap(999)
42
43         keyword = KeywordAnalyzer(Version.LUCENE_34)
44
45         # not sure if needed: there's NOT_ANALYZED meaning basically the same
46
47         PerFieldAnalyzerWrapper.__init__(self, polish)
48
49         self.addAnalyzer("tags", simple)
50         self.addAnalyzer("technical_editors", simple)
51         self.addAnalyzer("editors", simple)
52         self.addAnalyzer("url", keyword)
53         self.addAnalyzer("source_url", keyword)
54         self.addAnalyzer("source_name", simple)
55         self.addAnalyzer("publisher", simple)
56         self.addAnalyzer("authors", simple)
57         self.addAnalyzer("title", simple)
58
59         self.addAnalyzer("is_book", keyword)
60         # shouldn't the title have two forms? _pl and simple?
61
62         self.addAnalyzer("themes", simple)
63         self.addAnalyzer("themes_pl", polish)
64
65         self.addAnalyzer("tag_name", simple)
66         self.addAnalyzer("tag_name_pl", polish)
67
68         self.addAnalyzer("translators", simple)
69
70         self.addAnalyzer("KEYWORD", keyword)
71         self.addAnalyzer("SIMPLE", simple)
72         self.addAnalyzer("POLISH", polish)
73
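# A minimal usage sketch (not part of the original module): WLAnalyzer routes each
# field registered above to its analyzer and falls back to PolishAnalyzer for any
# other field (e.g. 'content'). The loop mirrors how Search.get_tokens() below
# consumes a token stream:
#
#     analyzer = WLAnalyzer()
#     stream = analyzer.reusableTokenStream("tags", StringReader(u"Epika, Liryka"))
#     while stream.incrementToken():
#         print stream.getAttribute(CharTermAttribute.class_).toString()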
74
75 class IndexStore(object):
76     """
77     Provides access to the search index.
78
79     self.store - the Lucene index directory
80     """
81     def __init__(self):
82         self.make_index_dir()
83         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
84
85     def make_index_dir(self):
86         try:
87             os.makedirs(settings.SEARCH_INDEX)
88         except OSError as exc:
89             if exc.errno == errno.EEXIST:
90                 pass
91             else: raise
92
93
94 class IndexChecker(IndexStore):
95     def __init__(self):
96         IndexStore.__init__(self)
97
98     def check(self):
99         checker = CheckIndex(self.store)
100         status = checker.checkIndex()
101         return status
102
103
104 class Snippets(object):
105     """
106     This class manages the snippet file for an indexed object (book).
107     The snippets are concatenated together, and their positions and
108     lengths are kept in Lucene index fields.
109     """
110     SNIPPET_DIR = "snippets"
111
112     def __init__(self, book_id):
113         try:
114             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
115         except OSError as exc:
116             if exc.errno == errno.EEXIST:
117                 pass
118             else: raise
119         self.book_id = book_id
120         self.file = None
121
122     def open(self, mode='r'):
123         """
124         Open the snippet file. Call .close() afterwards.
125         """
126         if 'b' not in mode:
127             mode += 'b'
128         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
129         self.position = 0
130         return self
131
132     def add(self, snippet):
133         """
134         Append a snippet (unicode) to the snippet file.
135         Return a (position, length) tuple
136         """
137         txt = snippet.encode('utf-8')
138         l = len(txt)
139         self.file.write(txt)
140         pos = (self.position, l)
141         self.position += l
142         return pos
143
144     def get(self, pos):
145         """
146         Given a (position, length) tuple, return the unicode
147         snippet stored there.
148         """
149         self.file.seek(pos[0], 0)
150         txt = self.file.read(pos[1]).decode('utf-8')
151         return txt
152
153     def close(self):
154         """Close snippet file"""
155         self.file.close()
156
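# A usage sketch (illustration only, not called anywhere): while indexing, a writer
# appends snippets and stores the returned (position, length) pair in the Lucene
# document; a reader later recovers the text from that pair. 'book_id' is an
# assumed placeholder.
#
#     snippets = Snippets(book_id).open('w')
#     try:
#         pos = snippets.add(u"Litwo! Ojczyzno moja!")
#     finally:
#         snippets.close()
#
#     snippets = Snippets(book_id).open()
#     try:
#         print snippets.get(pos)
#     finally:
#         snippets.close()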
157
158 class BaseIndex(IndexStore):
159     """
160     Base index class.
161     Provides basic operations on index: opening, closing, optimizing.
162     """
163     def __init__(self, analyzer=None):
164         super(BaseIndex, self).__init__()
165         self.index = None
166         if not analyzer:
167             analyzer = WLAnalyzer()
168         self.analyzer = analyzer
169
170     def open(self, analyzer=None):
171         if self.index:
172             raise Exception("Index is already opened")
173         self.index = IndexWriter(self.store, self.analyzer,\
174                                  IndexWriter.MaxFieldLength.LIMITED)
175         return self.index
176
177     def optimize(self):
178         self.index.optimize()
179
180     def close(self):
181         try:
182             self.index.optimize()
183         except JavaError, je:
184             print "Error during optimize phase, check index: %s" % je
185
186         self.index.close()
187         self.index = None
188
189     def __enter__(self):
190         self.open()
191         return self
192
193     def __exit__(self, type, value, tb):
194         self.close()
195
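# Sketch of the context-manager usage enabled by __enter__/__exit__ above
# (illustration only; the Book queryset is an assumed example, and Index is
# defined just below):
#
#     with Index() as index:
#         index.index_tags()
#         for book in catalogue.models.Book.objects.all():
#             index.index_book(book)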
196
197 class Index(BaseIndex):
198     """
199     Class indexing books.
200     """
201     def __init__(self, analyzer=None):
202         super(Index, self).__init__(analyzer)
203
204     def index_tags(self):
205         """
206         Re-index the global tag list.
207         Removes all tags from the index, then indexes them again.
208         Indexed fields include: id, name (with and without Polish stems), category.
209         """
210         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
211         self.index.deleteDocuments(q)
212
213         for tag in catalogue.models.Tag.objects.all():
214             doc = Document()
215             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
216             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
217             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
218             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
219             self.index.addDocument(doc)
220
221     def create_book_doc(self, book):
222         """
223         Create a Lucene document referring to the book id.
224         """
225         doc = Document()
226         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
227         if book.parent is not None:
228             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
229         return doc
230
231     def remove_book(self, book):
232         """Remove a book from the search index.
233         book - a Book instance."""
234         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
235         self.index.deleteDocuments(q)
236
237     def index_book(self, book, book_info=None, overwrite=True):
238         """
239         Indexes the book.
240         Creates a Lucene document for the extracted metadata
241         and calls self.index_content() to index the contents of the book.
242         """
243         if overwrite:
244             self.remove_book(book)
245
246         book_doc = self.create_book_doc(book)
247         meta_fields = self.extract_metadata(book, book_info)
248         for f in meta_fields.values():
249             if isinstance(f, (list, tuple)):
250                 for elem in f:
251                     book_doc.add(elem)
252             else:
253                 book_doc.add(f)
254
255         self.index.addDocument(book_doc)
256         del book_doc
257
258         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])
259
260     master_tags = [
261         'opowiadanie',
262         'powiesc',
263         'dramat_wierszowany_l',
264         'dramat_wierszowany_lp',
265         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
266         'wywiad'
267         ]
268
269     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
270
271     def extract_metadata(self, book, book_info=None):
272         """
273         Extract metadata from the book and return a map of fields keyed by field name.
274         """
275         fields = {}
276
277         if book_info is None:
278             book_info = dcparser.parse(open(book.xml_file.path))
279
280         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
281         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
282         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
283
284         # each field carries a validator and a name; the validator doubles as a type indicator
285         for field in dcparser.BookInfo.FIELDS:
286             if hasattr(book_info, field.name):
287                 if not getattr(book_info, field.name):
288                     continue
289                 # since no type information is available, we use validator
290                 type_indicator = field.validator
291                 if type_indicator == dcparser.as_unicode:
292                     s = getattr(book_info, field.name)
293                     if field.multiple:
294                         s = ', '.join(s)
295                     try:
296                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
297                     except JavaError as je:
298                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
299                 elif type_indicator == dcparser.as_person:
300                     p = getattr(book_info, field.name)
301                     if isinstance(p, dcparser.Person):
302                         persons = unicode(p)
303                     else:
304                         persons = ', '.join(map(unicode, p))
305                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
306                 elif type_indicator == dcparser.as_date:
307                     dt = getattr(book_info, field.name)
308                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
309                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
310
311         return fields
312
313     def add_gaps(self, fields, fieldname):
314         """
315         Interleaves a list of fields with gap-fields (indexed spaces) and returns the result.
316         This allows phrase queries (with slop 0) that do not match across the gaps.
317         """
318         def gap():
319             while True:
320                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
321         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
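    # Illustration (hypothetical values): for fields [F1, F2, F3] this returns
    # (F1, gap, F2, gap, F3) - the trailing gap is sliced off - so a slop-0
    # phrase query cannot match tokens coming from two different tag values:
    #
    #     fields = [Field("tags", t, Field.Store.NO, Field.Index.ANALYZED)
    #               for t in [u"Epika", u"Liryka"]]
    #     interleaved = self.add_gaps(fields, 'tags')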
322
323     def get_master(self, root):
324         """
325         Returns the first master tag from an etree.
326         """
327         for master in root.iter():
328             if master.tag in self.master_tags:
329                 return master
330
331     def index_content(self, book, book_fields=[]):
332         """
333         Walks the book XML and extracts content from it.
334         Adds parts for each header tag and for each fragment.
335         """
336         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
337         root = wld.edoc.getroot()
338
339         master = self.get_master(root)
340         if master is None:
341             return []
342
343         def walker(node):
344             yield node, None
345             for child in list(node):
346                 for b, e in walker(child):
347                     yield b, e
348             yield None, node
349             return
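        # For a node tree like <a><b/></a> the walker above yields, in order:
        # (a, None), (b, None), (None, b), (None, a) - an "open" event and a
        # "close" event for every element, depth first.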
350
351         def fix_format(text):
352             return re.sub("(?m)/$", "", text)
353
354         def add_part(snippets, **fields):
355             doc = self.create_book_doc(book)
356             for f in book_fields:
357                 doc.add(f)
358
359             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
360             doc.add(NumericField("header_span", Field.Store.YES, True)\
361                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
362             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
363
364             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
365                           Field.TermVector.WITH_POSITIONS_OFFSETS))
366
367             snip_pos = snippets.add(fields["content"])
368             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
369             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
370
371             if 'fragment_anchor' in fields:
372                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
373                               Field.Store.YES, Field.Index.NOT_ANALYZED))
374
375             if 'themes' in fields:
376                 themes, themes_pl = zip(*[
377                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
378                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
379                      for theme in fields['themes']])
380
381                 themes = self.add_gaps(themes, 'themes')
382                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
383
384                 for t in themes:
385                     doc.add(t)
386                 for t in themes_pl:
387                     doc.add(t)
388
389             return doc
390
391         def give_me_utf8(s):
392             if isinstance(s, unicode):
393                 return s.encode('utf-8')
394             else:
395                 return s
396
397         fragments = {}
398         snippets = Snippets(book.id).open('w')
399         try:
400             for header, position in zip(list(master), range(len(master))):
401
402                 if header.tag in self.skip_header_tags:
403                     continue
404
405                 # section content
406                 content = []
407
408                 for start, end in walker(header):
409                     # handle fragments and themes.
410                     if start is not None and start.tag == 'begin':
411                         fid = start.attrib['id'][1:]
412                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
413
414                     elif start is not None and start.tag == 'motyw':
415                         fid = start.attrib['id'][1:]
416                         if start.text is not None:
417                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
418
419                     elif start is not None and start.tag == 'end':
420                         fid = start.attrib['id'][1:]
421                         if fid not in fragments:
422                             continue  # a broken <end> node, skip it
423                                       #                        import pdb; pdb.set_trace()
424                         frag = fragments[fid]
425                         if frag['themes'] == []:
426                             continue  # empty themes list.
427                         del fragments[fid]
428
429                         def jstr(l):
430                             return u' '.join(map(
431                                 lambda x: x == None and u'(none)' or unicode(x),
432                                 l))
433
434                         doc = add_part(snippets,
435                                        header_type=frag['start_header'],
436                                        header_index=frag['start_section'],
437                                        header_span=position - frag['start_section'] + 1,
438                                        fragment_anchor=fid,
439                                        content=u' '.join(filter(lambda s: s is not None, frag['content'])),
440                                        themes=frag['themes'])
441
442                         self.index.addDocument(doc)
443
444                     # Collect content.
445                     elif start is not None:
446                         for frag in fragments.values():
447                             frag['content'].append(start.text)
448                         content.append(start.text)
449                     elif end is not None:
450                         for frag in fragments.values():
451                             frag['content'].append(end.tail)
452                         content.append(end.tail)
453
454                 # In the end, add the section text.
455                 doc = add_part(snippets, header_index=position, header_type=header.tag,
456                                content=fix_format(u' '.join(filter(lambda s: s is not None, content))))
457
458                 self.index.addDocument(doc)
459
460         finally:
461             snippets.close()
462
463
464 def log_exception_wrapper(f):
465     def _wrap(*a):
466         try:
467             f(*a)
468         except Exception, e:
469             print("Error in indexing thread: %s" % e)
470             traceback.print_exc()
471             raise e
472     return _wrap
473
474
475 class ReusableIndex(Index):
476     """
477     Works like Index, but does not close/optimize the Lucene index
478     until program exit (uses an atexit hook).
479     This is useful for the importbooks command.
480
481     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
482     """
483     index = None
484
485     def open(self, analyzer=None, threads=4):
486         if ReusableIndex.index is not None:
487             self.index = ReusableIndex.index
488         else:
489             print("opening index")
490             Index.open(self, analyzer)
491             ReusableIndex.index = self.index
492             atexit.register(ReusableIndex.close_reusable)
493
494     # def index_book(self, *args, **kw):
495     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
496     #     ReusableIndex.pool_jobs.append(job)
497
498     @staticmethod
499     def close_reusable():
500         if ReusableIndex.index is not None:
501             ReusableIndex.index.optimize()
502             ReusableIndex.index.close()
503             ReusableIndex.index = None
504
505     def close(self):
506         pass
507
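# Sketch of how a long-running import (e.g. the importbooks command mentioned in
# the docstring) is expected to use ReusableIndex; 'books_to_import' is an assumed
# iterable of Book instances.
#
#     index = ReusableIndex()
#     index.open()
#     for book in books_to_import:
#         index.index_book(book)
#     # the atexit hook calls ReusableIndex.close_reusable() at process exit;
#     # call it explicitly if atexit cannot be relied upon.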
508
509 class JoinSearch(object):
510     """
511     This mixin could be used to handle block join queries.
512     (currently unused)
513     """
514     def __init__(self, *args, **kw):
515         super(JoinSearch, self).__init__(*args, **kw)
516
517     def wrapjoins(self, query, fields=[]):
518         """
519         This function modifies the query recursively, so that
520         contained Term and Phrase queries which match the
521         provided fields are wrapped in a BlockJoinQuery
522         and thus delegated to child documents.
523         """
524         if BooleanQuery.instance_(query):
525             qs = BooleanQuery.cast_(query)
526             for clause in qs:
527                 clause = BooleanClause.cast_(clause)
528                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
529             return qs
530         else:
531             termset = HashSet()
532             query.extractTerms(termset)
533             for t in termset:
534                 t = Term.cast_(t)
535                 if t.field() not in fields:
536                     return query
537             return BlockJoinQuery(query, self.parent_filter,
538                                   BlockJoinQuery.ScoreMode.Total)
539
540     def bsearch(self, query, max_results=50):
541         q = self.query(query)
542         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
543
544         tops = self.searcher.search(bjq, max_results)
545         bks = []
546         for found in tops.scoreDocs:
547             doc = self.searcher.doc(found.doc)
548             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
549         return (bks, tops.totalHits)
550
551
552 class SearchResult(object):
553     def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
554         if score:
555             self.score = score
556         else:
557             self.score = scoreDocs.score
558
559         self._hits = []
560         self.hits = None  # processed hits
561
562         stored = searcher.doc(scoreDocs.doc)
563         self.book_id = int(stored.get("book_id"))
564
565         header_type = stored.get("header_type")
566         if not header_type:
567             return
568
569         sec = (header_type, int(stored.get("header_index")))
570         header_span = stored.get('header_span')
571         header_span = header_span is not None and int(header_span) or 1
572
573         fragment = stored.get("fragment_anchor")
574
575         if snippets:
576             snippets = snippets.replace("/\n", "\n")
577         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
578
579         self._hits.append(hit)
580
581     def merge(self, other):
582         if self.book_id != other.book_id:
583             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
584         self._hits += other._hits
585         if other.score > self.score:
586             self.score = other.score
587         return self
588
589     def get_book(self):
590         return catalogue.models.Book.objects.get(id=self.book_id)
591
592     book = property(get_book)
593
594     def process_hits(self):
595         POSITION = 0
596         FRAGMENT = 1
597         POSITION_INDEX = 1
598         POSITION_SPAN = 2
599         SCORE = 2
600         OTHER = 3
601
602         # to sections and fragments
603         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
604         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
605         sect = filter(lambda s: 0 == len(filter(
606             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
607             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
608             frags)), sect)
609
610         hits = []
611
612         # remove duplicate fragments
613         fragments = {}
614         for f in frags:
615             fid = f[FRAGMENT]
616             if fid in fragments:
617                 if fragments[fid][SCORE] >= f[SCORE]:
618                     continue
619             fragments[fid] = f
620         frags = fragments.values()
621
622         # remove duplicate sections
623         sections = {}
624
625         for s in sect:
626             si = s[POSITION][POSITION_INDEX]
627             # skip existing
628             if si in sections:
629                 if sections[si]['score'] >= s[SCORE]:
630                     continue
631
632             m = {'score': s[SCORE],
633                  'section_number': s[POSITION][POSITION_INDEX] + 1,
634                  }
635             m.update(s[OTHER])
636             sections[si] = m
637
638         hits = sections.values()
639
640         for f in frags:
641             frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
642             m = {'score': f[SCORE],
643                  'fragment': frag,
644                  'section_number': f[POSITION][POSITION_INDEX] + 1,
645                  'themes': frag.tags.filter(category='theme')
646                  }
647             m.update(f[OTHER])
648             hits.append(m)
649
650         hits.sort(key=lambda h: h['score'], reverse=True)
651
652         self.hits = hits
653
654         return self
655
656     def __unicode__(self):
657         return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)
658
659     @staticmethod
660     def aggregate(*result_lists):
661         books = {}
662         for rl in result_lists:
663             for r in rl:
664                 if r.book_id in books:
665                     books[r.book_id].merge(r)
666                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
667                 else:
668                     books[r.book_id] = r
669         return books.values()
670
671     def __cmp__(self, other):
672         return cmp(self.score, other.score)
673
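# A sketch of the typical flow (illustration only; 'search' is an assumed Search
# instance and 'query' an assumed unicode string): results from several strategies
# are aggregated per book, sorted by score, and post-processed.
#
#     results = SearchResult.aggregate(
#         search.search_perfect_book(query),
#         search.search_perfect_parts(query),
#         search.search_everywhere(query))
#     for r in sorted(results, reverse=True):
#         r.process_hits()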
674
675 class Hint(object):
676     """
677     Given some hint information (things we already know about
678     our search target - like author, title (a specific book), epoch, genre, kind)
679     we can narrow down the search using filters.
680     """
681     def __init__(self, search):
682         """
683         Accepts a Searcher instance.
684         """
685         self.search = search
686         self.book_tags = {}
687         self.part_tags = []
688         self._books = []
689
690     def books(self, *books):
691         """
692         Give a hint that we search these books.
693         """
694         self._books = books
695
696     def tags(self, tags):
697         """
698         Give a hint that these Tag objects (a list)
699         are required.
700         """
701         for t in tags:
702             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
703                 lst = self.book_tags.get(t.category, [])
704                 lst.append(t)
705                 self.book_tags[t.category] = lst
706             if t.category in ['theme', 'theme_pl']:
707                 self.part_tags.append(t)
708
709     def tag_filter(self, tags, field='tags'):
710         """
711         Given a list of tags and an optional field (they are normally in the 'tags' field),
712         return a filter accepting only books with those specific tags.
713         """
714         q = BooleanQuery()
715
716         for tag in tags:
717             toks = self.search.get_tokens(tag.name, field=field)
718             tag_phrase = PhraseQuery()
719             for tok in toks:
720                 tag_phrase.add(Term(field, tok))
721             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
722
723         return QueryWrapperFilter(q)
724
725     def book_filter(self):
726         """
727         Filters using book tags (all tag categories except themes).
728         """
729         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
730         if tags:
731             return self.tag_filter(tags)
732         else:
733             return None
734
735     def part_filter(self):
736         """
737         This filter can be used to look for book parts.
738         It filters on book id and/or themes.
739         """
740         fs = []
741         if self.part_tags:
742             fs.append(self.tag_filter(self.part_tags, field='themes'))
743
744         if self._books != []:
745             bf = BooleanFilter()
746             for b in self._books:
747                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
748                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
749             fs.append(bf)
750
751         return Search.chain_filters(fs)
752
753     def should_search_for_book(self):
754         return self._books == []
755
756     def just_search_in(self, all):
757         """Holds the logic to figure out which indexes should be searched when we already have some hints."""
758         some = []
759         for field in all:
760             if field == 'authors' and 'author' in self.book_tags:
761                 continue
762             if field == 'title' and self._books != []:
763                 continue
764             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
765                 continue
766             some.append(field)
767         return some
768
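# Hint usage sketch (illustration only; 'author_tags' is an assumed iterable of
# catalogue Tag objects): hints narrow down which fields and documents are searched.
#
#     search = Search()
#     hint = search.hint()
#     hint.tags(author_tags)
#     books = search.search_perfect_book(u"pan tadeusz", hint=hint)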
769
770 class Search(IndexStore):
771     """
772     Search facilities.
773     """
774     def __init__(self, default_field="content"):
775         IndexStore.__init__(self)
776         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
777         # self.analyzer = WLAnalyzer()
778         self.searcher = IndexSearcher(self.store, True)
779         self.parser = QueryParser(Version.LUCENE_34, default_field,
780                                   self.analyzer)
781
782         self.parent_filter = TermsFilter()
783         self.parent_filter.addTerm(Term("is_book", "true"))
784
785     def query(self, query):
786         """Parse query in default Lucene Syntax. (for humans)
787         """
788         return self.parser.parse(query)
789
790     def simple_search(self, query, max_results=50):
791         """Runs a query for books using lucene syntax. (for humans)
792         Returns (books, total_hits)
793         """
794
795         tops = self.searcher.search(self.query(query), max_results)
796         bks = []
797         for found in tops.scoreDocs:
798             doc = self.searcher.doc(found.doc)
799             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
800         return (bks, tops.totalHits)
801
802     def get_tokens(self, searched, field='content'):
803         """Returns tokens analyzed by the analyzer appropriate for the given field.
804         The argument can be a StringReader, a string/unicode, or a token list. In the last case
805         the tokens are just returned (so we can reuse tokens if we don't change the analyzer).
806         """
807         if isinstance(searched, basestring):
808             searched = StringReader(searched)
809         elif isinstance(searched, list):
810             return searched
811
812         searched.reset()
813         tokens = self.analyzer.reusableTokenStream(field, searched)
814         toks = []
815         while tokens.incrementToken():
816             cta = tokens.getAttribute(CharTermAttribute.class_)
817             toks.append(cta.toString())
818         return toks
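    # Example sketch (the exact tokens depend on the analyzer, notably the Polish
    # stemmer, so treat the output as illustrative; 'search' is an assumed Search
    # instance):
    #
    #     search.get_tokens(u"Ala ma kota", field='SIMPLE')  # -> [u'ala', u'ma', u'kota']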
819
820     def fuzziness(self, fuzzy):
821         """Helper method to sanitize fuzziness"""
822         if not fuzzy:
823             return None
824         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
825             return fuzzy
826         else:
827             return 0.5
828
829     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
830         """
831         Return a PhraseQuery with a series of tokens.
832         """
833         if fuzzy:
834             phrase = MultiPhraseQuery()
835             for t in tokens:
836                 term = Term(field, t)
837                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
838                 fuzzterms = []
839
840                 while True:
841                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
842                     ft = fuzzterm.term()
843                     if ft:
844                         fuzzterms.append(ft)
845                     if not fuzzterm.next(): break
846                 if fuzzterms:
847                     phrase.add(JArray('object')(fuzzterms, Term))
848                 else:
849                     phrase.add(term)
850         else:
851             phrase = PhraseQuery()
852             phrase.setSlop(slop)
853             for t in tokens:
854                 term = Term(field, t)
855                 phrase.add(term)
856         return phrase
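    # Sketch (assuming 'search' is a Search instance): a phrase query over the
    # 'title' field with the default slop of 2.
    #
    #     q = search.make_phrase(search.get_tokens(u"pan tadeusz", field='title'),
    #                            field='title')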
857
858     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
859         """
860         Returns term queries joined by a boolean query.
861         modal - applies to the boolean query
862         fuzzy - should the query be fuzzy.
863         """
864         q = BooleanQuery()
865         for t in tokens:
866             term = Term(field, t)
867             if fuzzy:
868                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
869             else:
870                 term = TermQuery(term)
871             q.add(BooleanClause(term, modal))
872         return q
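    # Sketch (assuming 'search' is a Search instance): OR all tokens over the
    # 'authors' field; pass modal=BooleanClause.Occur.MUST to require every token.
    #
    #     q = search.make_term_query(search.get_tokens(u"juliusz slowacki", field='SIMPLE'),
    #                                field='authors')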
873
874     # def content_query(self, query):
875     #     return BlockJoinQuery(query, self.parent_filter,
876     #                           BlockJoinQuery.ScoreMode.Total)
877
878     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
879         """
880         Search for perfect book matches. Just see if the query matches some author or title,
881         taking hints into account.
882         """
883         fields_to_search = ['authors', 'title']
884         only_in = None
885         if hint:
886             if not hint.should_search_for_book():
887                 return []
888             fields_to_search = hint.just_search_in(fields_to_search)
889             only_in = hint.book_filter()
890
891         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
892
893         books = []
894         for q in qrys:
895             top = self.searcher.search(q,
896                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
897                 max_results)
898             for found in top.scoreDocs:
899                 books.append(SearchResult(self.searcher, found, how_found="search_perfect_book"))
900         return books
901
902     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
903         fields_to_search = ['tags', 'authors', 'title']
904
905         only_in = None
906         if hint:
907             if not hint.should_search_for_book():
908                 return []
909             fields_to_search = hint.just_search_in(fields_to_search)
910             only_in = hint.book_filter()
911
912         tokens = self.get_tokens(searched, field='SIMPLE')
913
914         q = BooleanQuery()
915
916         for fld in fields_to_search:
917             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
918                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
919
920         books = []
921         top = self.searcher.search(q,
922                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
923             max_results)
924         for found in top.scoreDocs:
925             books.append(SearchResult(self.searcher, found, how_found="search_book"))
926
927         return books
928
929     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
930         """
931         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
932         some part/fragment of the book.
933         """
934         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
935
936         flt = None
937         if hint:
938             flt = hint.part_filter()
939
940         books = []
941         for q in qrys:
942             top = self.searcher.search(q,
943                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
944                                                            flt]),
945                                        max_results)
946             for found in top.scoreDocs:
947                 books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
948
949         return books
950
951     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
952         """
953         Tries to use search terms to match different fields of the book (or its parts).
954         E.g. one word can match an author, another a part of the title, and the rest
955         some words from the third chapter.
956         """
957         books = []
958         only_in = None
959
960         if hint:
961             only_in = hint.part_filter()
962
963         # content only query : themes x content
964         q = BooleanQuery()
965
966         tokens_pl = self.get_tokens(searched, field='content')
967         tokens = self.get_tokens(searched, field='SIMPLE')
968
969         # only search in themes when we do not already filter by themes
970         if hint is None or hint.just_search_in(['themes']) != []:
971             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
972                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
973
974         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
975                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
976
977         topDocs = self.searcher.search(q, only_in, max_results)
978         for found in topDocs.scoreDocs:
979             books.append(SearchResult(self.searcher, found, how_found='search_everywhere_themesXcontent'))
980             print "* %s theme x content: %s" % (searched, books[-1]._hits)
981
982         # query themes/content x author/title/tags
983         q = BooleanQuery()
984         in_content = BooleanQuery()
985         in_meta = BooleanQuery()
986
987         for fld in ['themes_pl', 'content']:
988             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
989
990         for fld in ['tags', 'authors', 'title']:
991             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
992
993         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
994         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
995
996         topDocs = self.searcher.search(q, only_in, max_results)
997         for found in topDocs.scoreDocs:
998             books.append(SearchResult(self.searcher, found, how_found='search_everywhere'))
999             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1000
1001         return books
1002
1003     # def multisearch(self, query, max_results=50):
1004     #     """
1005     #     Search strategy:
1006     #     - (phrase) OR -> content
1007     #                   -> title
1008     #                   -> authors
1009     #     - (keywords)  -> authors
1010     #                   -> motyw
1011     #                   -> tags
1012     #                   -> content
1013     #     """
1014         # queryreader = StringReader(query)
1015         # tokens = self.get_tokens(queryreader)
1016
1017         # top_level = BooleanQuery()
1018         # Should = BooleanClause.Occur.SHOULD
1019
1020         # phrase_level = BooleanQuery()
1021         # phrase_level.setBoost(1.3)
1022
1023         # p_content = self.make_phrase(tokens, joined=True)
1024         # p_title = self.make_phrase(tokens, 'title')
1025         # p_author = self.make_phrase(tokens, 'author')
1026
1027         # phrase_level.add(BooleanClause(p_content, Should))
1028         # phrase_level.add(BooleanClause(p_title, Should))
1029         # phrase_level.add(BooleanClause(p_author, Should))
1030
1031         # kw_level = BooleanQuery()
1032
1033         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1034         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1035         # kw_level.add(j_themes, Should)
1036         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1037         # j_con = self.make_term_query(tokens, joined=True)
1038         # kw_level.add(j_con, Should)
1039
1040         # top_level.add(BooleanClause(phrase_level, Should))
1041         # top_level.add(BooleanClause(kw_level, Should))
1042
1043         # return None
1044
1045     def get_snippets(self, scoreDoc, query, field='content'):
1046         """
1047         Returns a snippet for found scoreDoc.
1048         """
1049         htmlFormatter = SimpleHTMLFormatter()
1050         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1051
1052         stored = self.searcher.doc(scoreDoc.doc)
1053
1054         # locate content.
1055         snippets = Snippets(stored.get('book_id')).open()
1056         try:
1057             text = snippets.get((int(stored.get('snippets_position')),
1058                                  int(stored.get('snippets_length'))))
1059         finally:
1060             snippets.close()
1061
1062         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1063         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1064         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1065
1066         return snip
1067
1068     @staticmethod
1069     def enum_to_array(enum):
1070         """
1071         Converts a lucene TermEnum to array of Terms, suitable for
1072         addition to queries
1073         """
1074         terms = []
1075
1076         while True:
1077             t = enum.term()
1078             if t:
1079                 terms.append(t)
1080             if not enum.next(): break
1081
1082         if terms:
1083             return JArray('object')(terms, Term)
1084
1085     def search_tags(self, query, filter=None, max_results=40):
1086         """
1087         Search for Tag objects using query.
1088         """
1089         tops = self.searcher.search(query, filter, max_results)
1090
1091         tags = []
1092         for found in tops.scoreDocs:
1093             doc = self.searcher.doc(found.doc)
1094             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1095             tags.append(tag)
1096             print "%s (%d) -> %f" % (tag, tag.id, found.score)
1097
1098         return tags
1099
1100     def search_books(self, query, filter=None, max_results=10):
1101         """
1102         Searches for Book objects using query
1103         """
1104         bks = []
1105         tops = self.searcher.search(query, filter, max_results)
1106         for found in tops.scoreDocs:
1107             doc = self.searcher.doc(found.doc)
1108             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1109         return bks
1110
1111     def create_prefix_phrase(self, toks, field):
1112         q = MultiPhraseQuery()
1113         for i in range(len(toks)):
1114             t = Term(field, toks[i])
1115             if i == len(toks) - 1:
1116                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1117                 if pterms:
1118                     q.add(pterms)
1119                 else:
1120                     q.add(t)
1121             else:
1122                 q.add(t)
1123         return q
1124
1125     @staticmethod
1126     def term_filter(term, inverse=False):
1127         only_term = TermsFilter()
1128         only_term.addTerm(term)
1129
1130         if inverse:
1131             neg = BooleanFilter()
1132             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1133             only_term = neg
1134
1135         return only_term
1136
1137     def hint_tags(self, string, max_results=50):
1138         """
1139         Return auto-complete hints for tags
1140         using prefix search.
1141         """
1142         toks = self.get_tokens(string, field='SIMPLE')
1143         top = BooleanQuery()
1144
1145         for field in ['tag_name', 'tag_name_pl']:
1146             q = self.create_prefix_phrase(toks, field)
1147             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1148
1149         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1150
1151         return self.search_tags(top, no_book_cat, max_results=max_results)
1152
1153     def hint_books(self, string, max_results=50):
1154         """
1155         Returns auto-complete hints for book titles
1156         (since we do not index 'pseudo' title tags).
1157         Uses prefix search.
1158         """
1159         toks = self.get_tokens(string, field='SIMPLE')
1160
1161         q = self.create_prefix_phrase(toks, 'title')
1162
1163         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
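    # Sketch: both hint helpers take the user's partial input ('search' is an
    # assumed Search instance).
    #
    #     search.hint_tags(u"mick")    # tags whose name starts with "mick"
    #     search.hint_books(u"pan t")  # books whose title starts with "pan t"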
1164
1165     @staticmethod
1166     def chain_filters(filters, op=ChainedFilter.AND):
1167         """
1168         Chains a filter list together
1169         """
1170         filters = filter(lambda x: x is not None, filters)
1171         if not filters:
1172             return None
1173         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1174         return chf
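    # Sketch: combine an optional hint filter with the is_book filter, as the
    # search_* methods above do ('hint' is an assumed Hint instance; None entries
    # are dropped by chain_filters).
    #
    #     flt = Search.chain_filters([hint.book_filter(),
    #                                 Search.term_filter(Term('is_book', 'true'))])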
1175
1176     def filtered_categories(self, tags):
1177         """
1178         Return a list of tag categories, present in tags list.
1179         """
1180         cats = {}
1181         for t in tags:
1182             cats[t.category] = True
1183         return cats.keys()
1184
1185     def hint(self):
1186         return Hint(self)