Some refactoring & documentation changes in search.
[wolnelektury.git] / apps / search / index.py
# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize jvm
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

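# A usage sketch for Snippets (not executed here; the book id 1 is
# illustrative). Snippets are appended once during indexing and read
# back by (position, length) when highlighting results:
#
#     snippets = Snippets(1).open('w')
#     try:
#         pos = snippets.add(u"Pierwszy fragment tekstu.")
#     finally:
#         snippets.close()
#
#     snippets = Snippets(1).open()
#     try:
#         assert snippets.get(pos) == u"Pierwszy fragment tekstu."
#     finally:
#         snippets.close()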

class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on the index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

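# BaseIndex is a context manager, so any subclass can be used in a
# `with` block; a sketch (Index is defined below):
#
#     with Index() as index:
#         index.index_tags()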

class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny',
        'liryka_l',
        'liryka_lp',
        'wywiad',
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day),
                                               Field.Store.NO, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields (indexed spaces) and returns the result.
        This allows phrase queries which do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

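    # For example (a sketch): add_gaps([f1, f2, f3], 'tags') returns
    # [f1, gap, f2, gap, f3], where each gap is an unanalyzed ' ' field
    # named 'tags'. With slop 0, a phrase query can then never match
    # across the boundary between two adjacent tag values.
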
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            # Depth-first walk yielding (node, None) on entering a node
            # and (None, node) on leaving it.
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span', 1)))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue

                content = u' '.join(header.itertext())
                content = fix_format(content)

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)

                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # an empty themes list
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: u'(none)' if x is None else unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()

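# A sketch of indexing a single book (the slug is illustrative; assumes
# an open index, e.g. via the context manager):
#
#     book = catalogue.models.Book.objects.get(slug='pan-tadeusz')
#     with Index() as index:
#         index.index_book(book)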

def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass

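# A usage sketch for ReusableIndex: several books can be indexed in one
# process without reopening the writer each time; the shared index is
# closed by the atexit hook (the book objects are illustrative):
#
#     for book in catalogue.models.Book.objects.all():
#         with ReusableIndex() as index:
#             index.index_book(book)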

class JoinSearch(object):
    """
    This mixin can be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query recursively, so that contained
        Term and Phrase queries which match the provided fields are
        wrapped in a BlockJoinQuery and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, searcher, scoreDoc, score=None, how_found=None, snippets=None):
        self.snippets = []

        if score:
            self.score = score
        else:
            self.score = scoreDoc.score

        self.hits = []

        stored = searcher.doc(scoreDoc.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = int(header_span) if header_span is not None else 1

        fragment = stored.get("fragment_anchor")

        hit = (sec + (header_span,), fragment, scoreDoc.score, {'how_found': how_found, 'snippets': [snippets]})

        self.hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.hits += other.hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        # Hits with a fragment anchor are fragments; the rest are sections.
        frags = filter(lambda r: r[1] is not None, self.hits)
        sect = filter(lambda r: r[1] is None, self.hits)
        # Drop sections already covered by some fragment's header span.
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
            frags)), sect)

        hits = []

        for s in sect:
            m = {'score': s[2],
                 'header_index': s[0][1]
                 }
            m.update(s[3])
            hits.append(m)

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[1])
            m = {'score': f[2],
                 'fragment': frag,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[3])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)

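# A sketch of combining results from several strategies (the query
# string is illustrative): aggregate() merges per-book hits, and
# sorted() uses SearchResult.__cmp__ to order books by score:
#
#     search = Search()
#     parts = search.search_perfect_parts(u'lato')
#     books = search.search_perfect_book(u'lato')
#     results = sorted(SearchResult.aggregate(books, parts), reverse=True)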

class Hint(object):
    """
    Given some hint information (things we already know about the search
    target, like author, title (a specific book), epoch, genre or kind),
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of them)
        are necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (normally they are in the
        'tags' field), returns a filter accepting only books with those tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return not self._books

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched
        when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some

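# A usage sketch for Hint (the tag lookup is illustrative): collect what
# we already know, then pass the hint to the search methods below:
#
#     search = Search()
#     hint = search.hint()
#     hint.tags(catalogue.models.Tag.objects.filter(category='author',
#                                                   slug='adam-mickiewicz'))
#     books = search.search_perfect_book(u'pan tadeusz', hint=hint)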

class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits).
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

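    # A sketch of a human-typed query (the query string is illustrative);
    # simple_search parses plain Lucene syntax against the default field:
    #
    #     search = Search()
    #     books, total = search.simple_search(u'title:lato OR authors:mickiewicz')
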
    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: a StringReader, a string/unicode, or tokens. In the
        last case they are just returned (so tokens can be reused, as long as
        the analyzer does not change).
        """
        if isinstance(searched, basestring):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness"""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

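    # For example (a sketch): make_phrase([u'pan', u'tadeusz'], field='title')
    # builds a PhraseQuery equivalent to  title:"pan tadeusz"~2 , while
    # fuzzy=0.7 builds a MultiPhraseQuery in which each position accepts
    # every indexed term within that similarity of the given token.
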
    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    # def content_query(self, query):
    #     return BlockJoinQuery(query, self.parent_filter,
    #                           BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some
        author or title, taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with
        a slop of 2, the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Tries to use the search terms to match different fields of the book
        (or its parts). E.g. one word can be an author's surname, another a
        part of the title, and the rest words from the third chapter.
        """
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens = self.get_tokens(searched)
        if hint is None or hint.just_search_in(['themes_pl']) != []:
            q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        # query themes/content x author/title/tags
        in_content = BooleanQuery()

        for fld in ['themes', 'content', 'tags', 'authors', 'title']:
            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(in_content, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)

    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD

    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)

    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')

    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))

    #     kw_level = BooleanQuery()

    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)

    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))

    #     return None

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate content
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Search for Tag objects using query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i, tok in enumerate(toks):
            t = Term(field, tok)
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        """
        Return auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Returns auto-complete hints for book titles, since we do not index
        'pseudo' title-tags.
        Prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

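    # For example (a sketch): combine a hint filter with the is_book
    # filter; None entries are dropped, and an all-None list yields None
    # (i.e. no filtering):
    #
    #     flt = Search.chain_filters([hint.book_filter(),
    #                                 Search.term_filter(Term('is_book', 'true'))])
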
    def filtered_categories(self, tags):
        """
        Returns a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)