apps/search/index.py

   1 # -*- coding: utf-8 -*-
   2
   3 from django.conf import settings
   4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
   5     File, Field, Integer, \
   6     NumericField, Version, Document, JavaError, IndexSearcher, \
   7     QueryParser, PerFieldAnalyzerWrapper, \
   8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
   9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
  10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
  11     HashSet, BooleanClause, Term, CharTermAttribute, \
  12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
  13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
  14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
  15     BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
  16     initVM, CLASSPATH, JArray, JavaError
  17     # KeywordAnalyzer
  18
  19 # Initialize jvm
  20 JVM = initVM(CLASSPATH)
  21
  22 import sys
  23 import os
  24 import re
  25 import errno
  26 from librarian import dcparser
  27 from librarian.parser import WLDocument
  28 import catalogue.models
  29 from multiprocessing.pool import ThreadPool
  30 from threading import current_thread
  31 import atexit
  32 import traceback
  33
  34
  35 class WLAnalyzer(PerFieldAnalyzerWrapper):
  36     def __init__(self):
  37         polish = PolishAnalyzer(Version.LUCENE_34)
  38         #        polish_gap.setPositionIncrementGap(999)
  39
  40         simple = SimpleAnalyzer(Version.LUCENE_34)
  41         #        simple_gap.setPositionIncrementGap(999)
  42
  43         keyword = KeywordAnalyzer(Version.LUCENE_34)
  44
  45         # not sure if needed: there's NOT_ANALYZED meaning basically the same
  46
  47         PerFieldAnalyzerWrapper.__init__(self, polish)
  48
  49         self.addAnalyzer("tags", simple)
  50         self.addAnalyzer("technical_editors", simple)
  51         self.addAnalyzer("editors", simple)
  52         self.addAnalyzer("url", keyword)
  53         self.addAnalyzer("source_url", keyword)
  54         self.addAnalyzer("source_name", simple)
  55         self.addAnalyzer("publisher", simple)
  56         self.addAnalyzer("authors", simple)
  57         self.addAnalyzer("title", simple)
  58
  59         self.addAnalyzer("is_book", keyword)
  60         # shouldn't the title have two forms? _pl and simple?
  61
  62         self.addAnalyzer("themes", simple)
  63         self.addAnalyzer("themes_pl", polish)
  64
  65         self.addAnalyzer("tag_name", simple)
  66         self.addAnalyzer("tag_name_pl", polish)
  67
  68         self.addAnalyzer("translators", simple)
  69
  70         self.addAnalyzer("KEYWORD", keyword)
  71         self.addAnalyzer("SIMPLE", simple)
  72         self.addAnalyzer("POLISH", polish)
  73
  74
  75 class IndexStore(object):
  76     """
  77     Provides access to search index.
  78
  79     self.store - lucene index directory
  80     """
  81     def __init__(self):
  82         self.make_index_dir()
  83         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
  84
  85     def make_index_dir(self):
  86         try:
  87             os.makedirs(settings.SEARCH_INDEX)
  88         except OSError as exc:
  89             if exc.errno == errno.EEXIST:
  90                 pass
  91             else: raise
  92
  93
  94 class IndexChecker(IndexStore):
  95     def __init__(self):
  96         IndexStore.__init__(self)
  97
  98     def check(self):
  99         checker = CheckIndex(self.store)
 100         status = checker.checkIndex()
 101         return status
 102
 103
 104 class Snippets(object):
 105     """
 106     This class manages snippet files for indexed object (book)
 107     the snippets are concatenated together, and their positions and
 108     lengths are kept in lucene index fields.
 109     """
 110     SNIPPET_DIR = "snippets"
 111
 112     def __init__(self, book_id):
 113         try:
 114             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
 115         except OSError as exc:
 116             if exc.errno == errno.EEXIST:
 117                 pass
 118             else: raise
 119         self.book_id = book_id
 120         self.file = None
 121
 122     def open(self, mode='r'):
 123         """
 124         Open the snippet file. Call .close() afterwards.
 125         """
 126         if not 'b' in mode:
 127             mode += 'b'
 128         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
 129         self.position = 0
 130         return self
 131
 132     def add(self, snippet):
 133         """
 134         Append a snippet (unicode) to the snippet file.
 135         Return a (position, length) tuple
 136         """
 137         txt = snippet.encode('utf-8')
 138         l = len(txt)
 139         self.file.write(txt)
 140         pos = (self.position, l)
 141         self.position += l
 142         return pos
 143
 144     def get(self, pos):
 145         """
 146         Given a tuple of (position, length) return an unicode
 147         of the snippet stored there.
 148         """
 149         self.file.seek(pos[0], 0)
 150         txt = self.file.read(pos[1]).decode('utf-8')
 151         return txt
 152
 153     def close(self):
 154         """Close snippet file"""
 155         self.file.close()
 156
 157
 158 class BaseIndex(IndexStore):
 159     """
 160     Base index class.
 161     Provides basic operations on index: opening, closing, optimizing.
 162     """
 163     def __init__(self, analyzer=None):
 164         super(BaseIndex, self).__init__()
 165         self.index = None
 166         if not analyzer:
 167             analyzer = WLAnalyzer()
 168         self.analyzer = analyzer
 169
 170     def open(self, analyzer=None):
 171         if self.index:
 172             raise Exception("Index is already opened")
 173         self.index = IndexWriter(self.store, self.analyzer,\
 174                                  IndexWriter.MaxFieldLength.LIMITED)
 175         return self.index
 176
 177     def optimize(self):
 178         self.index.optimize()
 179
 180     def close(self):
 181         try:
 182             self.index.optimize()
 183         except JavaError, je:
 184             print "Error during optimize phase, check index: %s" % je
 185
 186         self.index.close()
 187         self.index = None
 188
 189     def __enter__(self):
 190         self.open()
 191         return self
 192
 193     def __exit__(self, type, value, tb):
 194         self.close()
 195
 196
 197 class Index(BaseIndex):
 198     """
 199     Class indexing books.
 200     """
 201     def __init__(self, analyzer=None):
 202         super(Index, self).__init__(analyzer)
 203
 204     def index_tags(self):
 205         """
 206         Re-index global tag list.
 207         Removes all tags from index, then index them again.
 208         Indexed fields include: id, name (with and without polish stems), category
 209         """
 210         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
 211         self.index.deleteDocuments(q)
 212
 213         for tag in catalogue.models.Tag.objects.all():
 214             doc = Document()
 215             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
 216             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 217             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 218             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
 219             self.index.addDocument(doc)
 220
 221     def create_book_doc(self, book):
 222         """
 223         Create a lucene document referring book id.
 224         """
 225         doc = Document()
 226         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
 227         if book.parent is not None:
 228             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
 229         return doc
 230
 231     def remove_book(self, book):
 232         """Removes a book from search index.
 233         book - Book instance."""
 234         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
 235         self.index.deleteDocuments(q)
 236
 237     def index_book(self, book, book_info=None, overwrite=True):
 238         """
 239         Indexes the book.
 240         Creates a lucene document for extracted metadata
 241         and calls self.index_content() to index the contents of the book.
 242         """
 243         if overwrite:
 244             self.remove_book(book)
 245
 246         book_doc = self.create_book_doc(book)
 247         meta_fields = self.extract_metadata(book, book_info)
 248         for f in meta_fields.values():
 249             if isinstance(f, list) or isinstance(f, tuple):
 250                 for elem in f:
 251                     book_doc.add(elem)
 252             else:
 253                 book_doc.add(f)
 254
 255         self.index.addDocument(book_doc)
 256         del book_doc
 257
 258         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])
 259
 260     master_tags = [
 261         'opowiadanie',
 262         'powiesc',
 263         'dramat_wierszowany_l',
 264         'dramat_wierszowany_lp',
 265         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 266         'wywiad'
 267         ]
 268
 269     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
 270
 271     def extract_metadata(self, book, book_info=None):
 272         """
 273         Extract metadata from book and returns a map of fields keyed by fieldname
 274         """
 275         fields = {}
 276
 277         if book_info is None:
 278             book_info = dcparser.parse(open(book.xml_file.path))
 279
 280         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
 281         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
 282         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
 283
 284         # validator, name
 285         for field in dcparser.BookInfo.FIELDS:
 286             if hasattr(book_info, field.name):
 287                 if not getattr(book_info, field.name):
 288                     continue
 289                 # since no type information is available, we use validator
 290                 type_indicator = field.validator
 291                 if type_indicator == dcparser.as_unicode:
 292                     s = getattr(book_info, field.name)
 293                     if field.multiple:
 294                         s = ', '.join(s)
 295                     try:
 296                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
 297                     except JavaError as je:
 298                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
 299                 elif type_indicator == dcparser.as_person:
 300                     p = getattr(book_info, field.name)
 301                     if isinstance(p, dcparser.Person):
 302                         persons = unicode(p)
 303                     else:
 304                         persons = ', '.join(map(unicode, p))
 305                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
 306                 elif type_indicator == dcparser.as_date:
 307                     dt = getattr(book_info, field.name)
 308                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
 309                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 310
 311         return fields
 312
 313     def add_gaps(self, fields, fieldname):
 314         """
 315         Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 316         This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 317         """
 318         def gap():
 319             while True:
 320                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 321         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 322
 323     def get_master(self, root):
 324         """
 325         Returns the first master tag from an etree.
 326         """
 327         for master in root.iter():
 328             if master.tag in self.master_tags:
 329                 return master
 330
 331     def index_content(self, book, book_fields=[]):
 332         """
 333         Walks the book XML and extract content from it.
 334         Adds parts for each header tag and for each fragment.
 335         """
 336         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 337         root = wld.edoc.getroot()
 338
 339         master = self.get_master(root)
 340         if master is None:
 341             return []
 342
 343         def walker(node):
 344             yield node, None
 345             for child in list(node):
 346                 for b, e in walker(child):
 347                     yield b, e
 348             yield None, node
 349             return
 350
 351         def fix_format(text):
 352             return re.sub("(?m)/$", "", text)
 353
 354         def add_part(snippets, **fields):
 355             doc = self.create_book_doc(book)
 356             for f in book_fields:
 357                 doc.add(f)
 358
 359             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
 360             doc.add(NumericField("header_span", Field.Store.YES, True)\
 361                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
 362             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
 363
 364             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
 365                           Field.TermVector.WITH_POSITIONS_OFFSETS))
 366
 367             snip_pos = snippets.add(fields["content"])
 368             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
 369             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
 370
 371             if 'fragment_anchor' in fields:
 372                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
 373                               Field.Store.YES, Field.Index.NOT_ANALYZED))
 374
 375             if 'themes' in fields:
 376                 themes, themes_pl = zip(*[
 377                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
 378                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
 379                      for theme in fields['themes']])
 380
 381                 themes = self.add_gaps(themes, 'themes')
 382                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
 383
 384                 for t in themes:
 385                     doc.add(t)
 386                 for t in themes_pl:
 387                     doc.add(t)
 388
 389             return doc
 390
 391         def give_me_utf8(s):
 392             if isinstance(s, unicode):
 393                 return s.encode('utf-8')
 394             else:
 395                 return s
 396
 397         fragments = {}
 398         snippets = Snippets(book.id).open('w')
 399         try:
 400             for header, position in zip(list(master), range(len(master))):
 401
 402                 if header.tag in self.skip_header_tags:
 403                     continue
 404
 405                 content = u' '.join([t for t in header.itertext()])
 406                 content = fix_format(content)
 407
 408                 doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)
 409
 410                 self.index.addDocument(doc)
 411
 412                 for start, end in walker(header):
 413                     if start is not None and start.tag == 'begin':
 414                         fid = start.attrib['id'][1:]
 415                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 416                         fragments[fid]['content'].append(start.tail)
 417                     elif start is not None and start.tag == 'motyw':
 418                         fid = start.attrib['id'][1:]
 419                         if start.text is not None:
 420                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
 421                         fragments[fid]['content'].append(start.tail)
 422                     elif start is not None and start.tag == 'end':
 423                         fid = start.attrib['id'][1:]
 424                         if fid not in fragments:
 425                             continue  # a broken <end> node, skip it
 426                         frag = fragments[fid]
 427                         if frag['themes'] == []:
 428                             continue  # empty themes list.
 429                         del fragments[fid]
 430
 431                         def jstr(l):
 432                             return u' '.join(map(
 433                                 lambda x: x == None and u'(none)' or unicode(x),
 434                                 l))
 435
 436                         doc = add_part(snippets,
 437                                        header_type=frag['start_header'],
 438                                        header_index=frag['start_section'],
 439                                        header_span=position - frag['start_section'] + 1,
 440                                        fragment_anchor=fid,
 441                                        content=u' '.join(filter(lambda s: s is not None, frag['content'])),
 442                                        themes=frag['themes'])
 443
 444                         self.index.addDocument(doc)
 445                     elif start is not None:
 446                         for frag in fragments.values():
 447                             frag['content'].append(start.text)
 448                     elif end is not None:
 449                         for frag in fragments.values():
 450                             frag['content'].append(end.tail)
 451         finally:
 452             snippets.close()
 453
 454
 455 def log_exception_wrapper(f):
 456     def _wrap(*a):
 457         try:
 458             f(*a)
 459         except Exception, e:
 460             print("Error in indexing thread: %s" % e)
 461             traceback.print_exc()
 462             raise e
 463     return _wrap
 464
 465
 466 class ReusableIndex(Index):
 467     """
 468     Works like index, but does not close/optimize Lucene index
 469     until program exit (uses atexit hook).
 470     This is usefull for importbooks command.
 471
 472     if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
 473     """
 474     index = None
 475
 476     def open(self, analyzer=None, threads=4):
 477         if ReusableIndex.index is not None:
 478             self.index = ReusableIndex.index
 479         else:
 480             print("opening index")
 481             Index.open(self, analyzer)
 482             ReusableIndex.index = self.index
 483             atexit.register(ReusableIndex.close_reusable)
 484
 485     # def index_book(self, *args, **kw):
 486     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
 487     #     ReusableIndex.pool_jobs.append(job)
 488
 489     @staticmethod
 490     def close_reusable():
 491         if ReusableIndex.index is not None:
 492             ReusableIndex.index.optimize()
 493             ReusableIndex.index.close()
 494             ReusableIndex.index = None
 495
 496     def close(self):
 497         pass
 498
 499
 500 class JoinSearch(object):
 501     """
 502     This mixin could be used to handle block join queries.
 503     (currently unused)
 504     """
 505     def __init__(self, *args, **kw):
 506         super(JoinSearch, self).__init__(*args, **kw)
 507
 508     def wrapjoins(self, query, fields=[]):
 509         """
 510         This functions modifies the query in a recursive way,
 511         so Term and Phrase Queries contained, which match
 512         provided fields are wrapped in a BlockJoinQuery,
 513         and so delegated to children documents.
 514         """
 515         if BooleanQuery.instance_(query):
 516             qs = BooleanQuery.cast_(query)
 517             for clause in qs:
 518                 clause = BooleanClause.cast_(clause)
 519                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
 520             return qs
 521         else:
 522             termset = HashSet()
 523             query.extractTerms(termset)
 524             for t in termset:
 525                 t = Term.cast_(t)
 526                 if t.field() not in fields:
 527                     return query
 528             return BlockJoinQuery(query, self.parent_filter,
 529                                   BlockJoinQuery.ScoreMode.Total)
 530
 531     def bsearch(self, query, max_results=50):
 532         q = self.query(query)
 533         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
 534
 535         tops = self.searcher.search(bjq, max_results)
 536         bks = []
 537         for found in tops.scoreDocs:
 538             doc = self.searcher.doc(found.doc)
 539             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 540         return (bks, tops.totalHits)
 541
 542
 543 class SearchResult(object):
 544     def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
 545         if score:
 546             self.score = score
 547         else:
 548             self.score = scoreDocs.score
 549
 550         self._hits = []
 551         self.hits = None  # processed hits
 552
 553         stored = searcher.doc(scoreDocs.doc)
 554         self.book_id = int(stored.get("book_id"))
 555
 556         header_type = stored.get("header_type")
 557         if not header_type:
 558             return
 559
 560         sec = (header_type, int(stored.get("header_index")))
 561         header_span = stored.get('header_span')
 562         header_span = header_span is not None and int(header_span) or 1
 563
 564         fragment = stored.get("fragment_anchor")
 565
 566         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
 567
 568         self._hits.append(hit)
 569
 570     def merge(self, other):
 571         if self.book_id != other.book_id:
 572             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 573         self._hits += other._hits
 574         if other.score > self.score:
 575             self.score = other.score
 576         return self
 577
 578     def get_book(self):
 579         return catalogue.models.Book.objects.get(id=self.book_id)
 580
 581     book = property(get_book)
 582
 583     def process_hits(self):
 584         POSITION = 0
 585         FRAGMENT = 1
 586         POSITION_INDEX = 1
 587         POSITION_SPAN = 2
 588         SCORE = 2
 589         OTHER = 3
 590
 591         # to sections and fragments
 592         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
 593         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
 594         sect = filter(lambda s: 0 == len(filter(
 595             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
 596             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
 597             frags)), sect)
 598
 599         hits = []
 600
 601         # remove duplicate fragments
 602         fragments = {}
 603         for f in frags:
 604             fid = f[FRAGMENT]
 605             if fid in fragments:
 606                 if fragments[fid][SCORE] >= f[SCORE]:
 607                     continue
 608             fragments[fid] = f
 609         frags = fragments.values()
 610
 611         # remove duplicate sections
 612         sections = {}
 613
 614         for s in sect:
 615             si = s[POSITION][POSITION_INDEX]
 616             # skip existing
 617             if si in sections:
 618                 if sections[si]['score'] >= s[SCORE]:
 619                     continue
 620
 621             m = {'score': s[SCORE],
 622                  'header_index': s[POSITION][POSITION_INDEX]
 623                  }
 624             m.update(s[OTHER])
 625             sections[si] = m
 626
 627         hits = sections.values()
 628
 629         for f in frags:
 630             frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
 631             m = {'score': f[SCORE],
 632                  'fragment': frag,
 633                  'themes': frag.tags.filter(category='theme')
 634                  }
 635             m.update(f[OTHER])
 636             hits.append(m)
 637
 638         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 639
 640         self.hits = hits
 641
 642         return self
 643
 644     def __unicode__(self):
 645         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
 646
 647     @staticmethod
 648     def aggregate(*result_lists):
 649         books = {}
 650         for rl in result_lists:
 651             for r in rl:
 652                 if r.book_id in books:
 653                     books[r.book_id].merge(r)
 654                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
 655                 else:
 656                     books[r.book_id] = r
 657         return books.values()
 658
 659     def __cmp__(self, other):
 660         return cmp(self.score, other.score)
 661
 662
 663 class Hint(object):
 664     """
 665     Given some hint information (information we already know about)
 666     our search target - like author, title (specific book), epoch, genre, kind
 667     we can narrow down search using filters.
 668     """
 669     def __init__(self, search):
 670         """
 671         Accepts a Searcher instance.
 672         """
 673         self.search = search
 674         self.book_tags = {}
 675         self.part_tags = []
 676         self._books = []
 677
 678     def books(self, *books):
 679         """
 680         Give a hint that we search these books.
 681         """
 682         self._books = books
 683
 684     def tags(self, tags):
 685         """
 686         Give a hint that these Tag objects (a list of)
 687         is necessary.
 688         """
 689         for t in tags:
 690             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
 691                 lst = self.book_tags.get(t.category, [])
 692                 lst.append(t)
 693                 self.book_tags[t.category] = lst
 694             if t.category in ['theme', 'theme_pl']:
 695                 self.part_tags.append(t)
 696
 697     def tag_filter(self, tags, field='tags'):
 698         """
 699         Given a lsit of tags and an optional field (but they are normally in tags field)
 700         returns a filter accepting only books with specific tags.
 701         """
 702         q = BooleanQuery()
 703
 704         for tag in tags:
 705             toks = self.search.get_tokens(tag.name, field=field)
 706             tag_phrase = PhraseQuery()
 707             for tok in toks:
 708                 tag_phrase.add(Term(field, tok))
 709             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
 710
 711         return QueryWrapperFilter(q)
 712
 713     def book_filter(self):
 714         """
 715         Filters using book tags (all tag kinds except a theme)
 716         """
 717         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
 718         if tags:
 719             return self.tag_filter(tags)
 720         else:
 721             return None
 722
 723     def part_filter(self):
 724         """
 725         This filter can be used to look for book parts.
 726         It filters on book id and/or themes.
 727         """
 728         fs = []
 729         if self.part_tags:
 730             fs.append(self.tag_filter(self.part_tags, field='themes'))
 731
 732         if self._books != []:
 733             bf = BooleanFilter()
 734             for b in self._books:
 735                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
 736                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
 737             fs.append(bf)
 738
 739         return Search.chain_filters(fs)
 740
 741     def should_search_for_book(self):
 742         return self._books == []
 743
 744     def just_search_in(self, all):
 745         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
 746         some = []
 747         for field in all:
 748             if field == 'authors' and 'author' in self.book_tags:
 749                 continue
 750             if field == 'title' and self._books != []:
 751                 continue
 752             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
 753                 continue
 754             some.append(field)
 755         return some
 756
 757
 758 class Search(IndexStore):
 759     """
 760     Search facilities.
 761     """
 762     def __init__(self, default_field="content"):
 763         IndexStore.__init__(self)
 764         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
 765         # self.analyzer = WLAnalyzer()
 766         self.searcher = IndexSearcher(self.store, True)
 767         self.parser = QueryParser(Version.LUCENE_34, default_field,
 768                                   self.analyzer)
 769
 770         self.parent_filter = TermsFilter()
 771         self.parent_filter.addTerm(Term("is_book", "true"))
 772
 773     def query(self, query):
 774         """Parse query in default Lucene Syntax. (for humans)
 775         """
 776         return self.parser.parse(query)
 777
 778     def simple_search(self, query, max_results=50):
 779         """Runs a query for books using lucene syntax. (for humans)
 780         Returns (books, total_hits)
 781         """
 782
 783         tops = self.searcher.search(self.query(query), max_results)
 784         bks = []
 785         for found in tops.scoreDocs:
 786             doc = self.searcher.doc(found.doc)
 787             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 788         return (bks, tops.totalHits)
 789
 790     def get_tokens(self, searched, field='content'):
 791         """returns tokens analyzed by a proper (for a field) analyzer
 792         argument can be: StringReader, string/unicode, or tokens. In the last case
 793         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
 794         """
 795         if isinstance(searched, str) or isinstance(searched, unicode):
 796             searched = StringReader(searched)
 797         elif isinstance(searched, list):
 798             return searched
 799
 800         searched.reset()
 801         tokens = self.analyzer.reusableTokenStream(field, searched)
 802         toks = []
 803         while tokens.incrementToken():
 804             cta = tokens.getAttribute(CharTermAttribute.class_)
 805             toks.append(cta.toString())
 806         return toks
 807
 808     def fuzziness(self, fuzzy):
 809         """Helper method to sanitize fuzziness"""
 810         if not fuzzy:
 811             return None
 812         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
 813             return fuzzy
 814         else:
 815             return 0.5
 816
 817     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
 818         """
 819         Return a PhraseQuery with a series of tokens.
 820         """
 821         if fuzzy:
 822             phrase = MultiPhraseQuery()
 823             for t in tokens:
 824                 term = Term(field, t)
 825                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
 826                 fuzzterms = []
 827
 828                 while True:
 829                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
 830                     ft = fuzzterm.term()
 831                     if ft:
 832                         fuzzterms.append(ft)
 833                     if not fuzzterm.next(): break
 834                 if fuzzterms:
 835                     phrase.add(JArray('object')(fuzzterms, Term))
 836                 else:
 837                     phrase.add(term)
 838         else:
 839             phrase = PhraseQuery()
 840             phrase.setSlop(slop)
 841             for t in tokens:
 842                 term = Term(field, t)
 843                 phrase.add(term)
 844         return phrase
 845
 846     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
 847         """
 848         Returns term queries joined by boolean query.
 849         modal - applies to boolean query
 850         fuzzy - should the query by fuzzy.
 851         """
 852         q = BooleanQuery()
 853         for t in tokens:
 854             term = Term(field, t)
 855             if fuzzy:
 856                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
 857             else:
 858                 term = TermQuery(term)
 859             q.add(BooleanClause(term, modal))
 860         return q
 861
 862     # def content_query(self, query):
 863     #     return BlockJoinQuery(query, self.parent_filter,
 864     #                           BlockJoinQuery.ScoreMode.Total)
 865
 866     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
 867         """
 868         Search for perfect book matches. Just see if the query matches with some author or title,
 869         taking hints into account.
 870         """
 871         fields_to_search = ['authors', 'title']
 872         only_in = None
 873         if hint:
 874             if not hint.should_search_for_book():
 875                 return []
 876             fields_to_search = hint.just_search_in(fields_to_search)
 877             only_in = hint.book_filter()
 878
 879         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
 880
 881         books = []
 882         for q in qrys:
 883             top = self.searcher.search(q,
 884                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
 885                 max_results)
 886             for found in top.scoreDocs:
 887                 books.append(SearchResult(self.searcher, found))
 888         return books
 889
 890     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
 891         fields_to_search = ['tags', 'authors', 'title']
 892
 893         only_in = None
 894         if hint:
 895             if not hint.should_search_for_book():
 896                 return []
 897             fields_to_search = hint.just_search_in(fields_to_search)
 898             only_in = hint.book_filter()
 899
 900         tokens = self.get_tokens(searched, field='SIMPLE')
 901
 902         q = BooleanQuery()
 903
 904         for fld in fields_to_search:
 905             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
 906                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 907
 908         books = []
 909         top = self.searcher.search(q,
 910                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
 911             max_results)
 912         for found in top.scoreDocs:
 913             books.append(SearchResult(self.searcher, found))
 914
 915         return books
 916
 917     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
 918         """
 919         Search for book parts which containt a phrase perfectly matching (with a slop of 2, default for make_phrase())
 920         some part/fragment of the book.
 921         """
 922         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
 923
 924         flt = None
 925         if hint:
 926             flt = hint.part_filter()
 927
 928         books = []
 929         for q in qrys:
 930             top = self.searcher.search(q,
 931                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
 932                                                            flt]),
 933                                        max_results)
 934             for found in top.scoreDocs:
 935                 books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))
 936
 937         return books
 938
 939     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
 940         """
 941         Tries to use search terms to match different fields of book (or its parts).
 942         E.g. one word can be an author survey, another be a part of the title, and the rest
 943         are some words from third chapter.
 944         """
 945         books = []
 946         only_in = None
 947
 948         if hint:
 949             only_in = hint.part_filter()
 950
 951         # content only query : themes x content
 952         q = BooleanQuery()
 953
 954         tokens_pl = self.get_tokens(searched, field='content')
 955         tokens = self.get_tokens(searched, field='SIMPLE')
 956
 957         # only search in themes when we do not already filter by themes
 958         if hint is None or hint.just_search_in(['themes']) != []:
 959             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
 960                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
 961
 962         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
 963                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 964
 965         topDocs = self.searcher.search(q, only_in, max_results)
 966         for found in topDocs.scoreDocs:
 967             books.append(SearchResult(self.searcher, found))
 968             print "* %s theme x content: %s" % (searched, books[-1]._hits)
 969
 970         # query themes/content x author/title/tags
 971         q = BooleanQuery()
 972         in_content = BooleanQuery()
 973         in_meta = BooleanQuery()
 974
 975         for fld in ['themes_pl', 'content']:
 976             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
 977
 978         for fld in ['tags', 'authors', 'title']:
 979             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
 980
 981         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
 982         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
 983
 984         topDocs = self.searcher.search(q, only_in, max_results)
 985         for found in topDocs.scoreDocs:
 986             books.append(SearchResult(self.searcher, found))
 987             print "* %s scatter search: %s" % (searched, books[-1]._hits)
 988
 989         return books
 990
 991     # def multisearch(self, query, max_results=50):
 992     #     """
 993     #     Search strategy:
 994     #     - (phrase) OR -> content
 995     #                   -> title
 996     #                   -> authors
 997     #     - (keywords)  -> authors
 998     #                   -> motyw
 999     #                   -> tags
1000     #                   -> content
1001     #     """
1002         # queryreader = StringReader(query)
1003         # tokens = self.get_tokens(queryreader)
1004
1005         # top_level = BooleanQuery()
1006         # Should = BooleanClause.Occur.SHOULD
1007
1008         # phrase_level = BooleanQuery()
1009         # phrase_level.setBoost(1.3)
1010
1011         # p_content = self.make_phrase(tokens, joined=True)
1012         # p_title = self.make_phrase(tokens, 'title')
1013         # p_author = self.make_phrase(tokens, 'author')
1014
1015         # phrase_level.add(BooleanClause(p_content, Should))
1016         # phrase_level.add(BooleanClause(p_title, Should))
1017         # phrase_level.add(BooleanClause(p_author, Should))
1018
1019         # kw_level = BooleanQuery()
1020
1021         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1022         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1023         # kw_level.add(j_themes, Should)
1024         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1025         # j_con = self.make_term_query(tokens, joined=True)
1026         # kw_level.add(j_con, Should)
1027
1028         # top_level.add(BooleanClause(phrase_level, Should))
1029         # top_level.add(BooleanClause(kw_level, Should))
1030
1031         # return None
1032
1033
1034     def get_snippets(self, scoreDoc, query, field='content'):
1035         """
1036         Returns a snippet for found scoreDoc.
1037         """
1038         htmlFormatter = SimpleHTMLFormatter()
1039         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1040
1041         stored = self.searcher.doc(scoreDoc.doc)
1042
1043         # locate content.
1044         snippets = Snippets(stored.get('book_id')).open()
1045         try:
1046             text = snippets.get((int(stored.get('snippets_position')),
1047                                  int(stored.get('snippets_length'))))
1048         finally:
1049             snippets.close()
1050
1051         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1052         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1053         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1054
1055         return snip
1056
1057     @staticmethod
1058     def enum_to_array(enum):
1059         """
1060         Converts a lucene TermEnum to array of Terms, suitable for
1061         addition to queries
1062         """
1063         terms = []
1064
1065         while True:
1066             t = enum.term()
1067             if t:
1068                 terms.append(t)
1069             if not enum.next(): break
1070
1071         if terms:
1072             return JArray('object')(terms, Term)
1073
1074     def search_tags(self, query, filter=None, max_results=40):
1075         """
1076         Search for Tag objects using query.
1077         """
1078         tops = self.searcher.search(query, filter, max_results)
1079
1080         tags = []
1081         for found in tops.scoreDocs:
1082             doc = self.searcher.doc(found.doc)
1083             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1084             tags.append(tag)
1085             print "%s (%d) -> %f" % (tag, tag.id, found.score)
1086
1087         return tags
1088
1089     def search_books(self, query, filter=None, max_results=10):
1090         """
1091         Searches for Book objects using query
1092         """
1093         bks = []
1094         tops = self.searcher.search(query, filter, max_results)
1095         for found in tops.scoreDocs:
1096             doc = self.searcher.doc(found.doc)
1097             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1098         return bks
1099
1100     def create_prefix_phrase(self, toks, field):
1101         q = MultiPhraseQuery()
1102         for i in range(len(toks)):
1103             t = Term(field, toks[i])
1104             if i == len(toks) - 1:
1105                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1106                 if pterms:
1107                     q.add(pterms)
1108                 else:
1109                     q.add(t)
1110             else:
1111                 q.add(t)
1112         return q
1113
1114     @staticmethod
1115     def term_filter(term, inverse=False):
1116         only_term = TermsFilter()
1117         only_term.addTerm(term)
1118
1119         if inverse:
1120             neg = BooleanFilter()
1121             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1122             only_term = neg
1123
1124         return only_term
1125
1126     def hint_tags(self, string, max_results=50):
1127         """
1128         Return auto-complete hints for tags
1129         using prefix search.
1130         """
1131         toks = self.get_tokens(string, field='SIMPLE')
1132         top = BooleanQuery()
1133
1134         for field in ['tag_name', 'tag_name_pl']:
1135             q = self.create_prefix_phrase(toks, field)
1136             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1137
1138         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1139
1140         return self.search_tags(top, no_book_cat, max_results=max_results)
1141
1142     def hint_books(self, string, max_results=50):
1143         """
1144         Returns auto-complete hints for book titles
1145         Because we do not index 'pseudo' title-tags.
1146         Prefix search.
1147         """
1148         toks = self.get_tokens(string, field='SIMPLE')
1149
1150         q = self.create_prefix_phrase(toks, 'title')
1151
1152         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1153
1154     @staticmethod
1155     def chain_filters(filters, op=ChainedFilter.AND):
1156         """
1157         Chains a filter list together
1158         """
1159         filters = filter(lambda x: x is not None, filters)
1160         if not filters:
1161             return None
1162         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1163         return chf
1164
1165     def filtered_categories(self, tags):
1166         """
1167         Return a list of tag categories, present in tags list.
1168         """
1169         cats = {}
1170         for t in tags:
1171             cats[t.category] = True
1172         return cats.keys()
1173
1174     def hint(self):
1175         return Hint(self)