apps/search/index.py

   1 # -*- coding: utf-8 -*-
   2
   3 from django.conf import settings
   4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
   5     File, Field, Integer, \
   6     NumericField, Version, Document, JavaError, IndexSearcher, \
   7     QueryParser, PerFieldAnalyzerWrapper, \
   8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
   9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
  10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
  11     HashSet, BooleanClause, Term, CharTermAttribute, \
  12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
  13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
  14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
  15     BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
  16     initVM, CLASSPATH, JArray, JavaError
  17     # KeywordAnalyzer
  18
  19 # Initialize jvm
  20 JVM = initVM(CLASSPATH)
  21
  22 import sys
  23 import os
  24 import re
  25 import errno
  26 from librarian import dcparser
  27 from librarian.parser import WLDocument
  28 import catalogue.models
  29 from multiprocessing.pool import ThreadPool
  30 from threading import current_thread
  31 import atexit
  32 import traceback
  33
  34
  35 class WLAnalyzer(PerFieldAnalyzerWrapper):
  36     def __init__(self):
  37         polish = PolishAnalyzer(Version.LUCENE_34)
  38         #        polish_gap.setPositionIncrementGap(999)
  39
  40         simple = SimpleAnalyzer(Version.LUCENE_34)
  41         #        simple_gap.setPositionIncrementGap(999)
  42
  43         keyword = KeywordAnalyzer(Version.LUCENE_34)
  44
  45         # not sure if needed: there's NOT_ANALYZED meaning basically the same
  46
  47         PerFieldAnalyzerWrapper.__init__(self, polish)
  48
  49         self.addAnalyzer("tags", simple)
  50         self.addAnalyzer("technical_editors", simple)
  51         self.addAnalyzer("editors", simple)
  52         self.addAnalyzer("url", keyword)
  53         self.addAnalyzer("source_url", keyword)
  54         self.addAnalyzer("source_name", simple)
  55         self.addAnalyzer("publisher", simple)
  56         self.addAnalyzer("authors", simple)
  57         self.addAnalyzer("title", simple)
  58
  59         self.addAnalyzer("is_book", keyword)
  60         # shouldn't the title have two forms? _pl and simple?
  61
  62         self.addAnalyzer("themes", simple)
  63         self.addAnalyzer("themes_pl", polish)
  64
  65         self.addAnalyzer("tag_name", simple)
  66         self.addAnalyzer("tag_name_pl", polish)
  67
  68         self.addAnalyzer("translators", simple)
  69
  70         self.addAnalyzer("KEYWORD", keyword)
  71         self.addAnalyzer("SIMPLE", simple)
  72         self.addAnalyzer("POLISH", polish)
  73
  74
  75 class IndexStore(object):
  76     """
  77     Provides access to search index.
  78
  79     self.store - lucene index directory
  80     """
  81     def __init__(self):
  82         self.make_index_dir()
  83         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
  84
  85     def make_index_dir(self):
  86         try:
  87             os.makedirs(settings.SEARCH_INDEX)
  88         except OSError as exc:
  89             if exc.errno == errno.EEXIST:
  90                 pass
  91             else: raise
  92
  93
  94 class IndexChecker(IndexStore):
  95     def __init__(self):
  96         IndexStore.__init__(self)
  97
  98     def check(self):
  99         checker = CheckIndex(self.store)
 100         status = checker.checkIndex()
 101         return status
 102
 103
 104 class Snippets(object):
 105     """
 106     This class manages snippet files for indexed object (book)
 107     the snippets are concatenated together, and their positions and
 108     lengths are kept in lucene index fields.
 109     """
 110     SNIPPET_DIR = "snippets"
 111
 112     def __init__(self, book_id):
 113         try:
 114             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
 115         except OSError as exc:
 116             if exc.errno == errno.EEXIST:
 117                 pass
 118             else: raise
 119         self.book_id = book_id
 120         self.file = None
 121
 122     def open(self, mode='r'):
 123         """
 124         Open the snippet file. Call .close() afterwards.
 125         """
 126         if not 'b' in mode:
 127             mode += 'b'
 128         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
 129         self.position = 0
 130         return self
 131
 132     def add(self, snippet):
 133         """
 134         Append a snippet (unicode) to the snippet file.
 135         Return a (position, length) tuple
 136         """
 137         txt = snippet.encode('utf-8')
 138         l = len(txt)
 139         self.file.write(txt)
 140         pos = (self.position, l)
 141         self.position += l
 142         print "SSSS %s - %s" % (pos, txt)
 143         return pos
 144
 145     def get(self, pos):
 146         """
 147         Given a tuple of (position, length) return an unicode
 148         of the snippet stored there.
 149         """
 150         self.file.seek(pos[0], 0)
 151         txt = self.file.read(pos[1]).decode('utf-8')
 152         return txt
 153
 154     def close(self):
 155         """Close snippet file"""
 156         self.file.close()
 157
 158
 159 class BaseIndex(IndexStore):
 160     """
 161     Base index class.
 162     Provides basic operations on index: opening, closing, optimizing.
 163     """
 164     def __init__(self, analyzer=None):
 165         super(BaseIndex, self).__init__()
 166         self.index = None
 167         if not analyzer:
 168             analyzer = WLAnalyzer()
 169         self.analyzer = analyzer
 170
 171     def open(self, analyzer=None):
 172         if self.index:
 173             raise Exception("Index is already opened")
 174         self.index = IndexWriter(self.store, self.analyzer,\
 175                                  IndexWriter.MaxFieldLength.LIMITED)
 176         return self.index
 177
 178     def optimize(self):
 179         self.index.optimize()
 180
 181     def close(self):
 182         try:
 183             self.index.optimize()
 184         except JavaError, je:
 185             print "Error during optimize phase, check index: %s" % je
 186
 187         self.index.close()
 188         self.index = None
 189
 190     def __enter__(self):
 191         self.open()
 192         return self
 193
 194     def __exit__(self, type, value, tb):
 195         self.close()
 196
 197
 198 class Index(BaseIndex):
 199     """
 200     Class indexing books.
 201     """
 202     def __init__(self, analyzer=None):
 203         super(Index, self).__init__(analyzer)
 204
 205     def index_tags(self):
 206         """
 207         Re-index global tag list.
 208         Removes all tags from index, then index them again.
 209         Indexed fields include: id, name (with and without polish stems), category
 210         """
 211         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
 212         self.index.deleteDocuments(q)
 213
 214         for tag in catalogue.models.Tag.objects.all():
 215             doc = Document()
 216             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
 217             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 218             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 219             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
 220             self.index.addDocument(doc)
 221
 222     def create_book_doc(self, book):
 223         """
 224         Create a lucene document referring book id.
 225         """
 226         doc = Document()
 227         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
 228         if book.parent is not None:
 229             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
 230         return doc
 231
 232     def remove_book(self, book):
 233         """Removes a book from search index.
 234         book - Book instance."""
 235         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
 236         self.index.deleteDocuments(q)
 237
 238     def index_book(self, book, book_info=None, overwrite=True):
 239         """
 240         Indexes the book.
 241         Creates a lucene document for extracted metadata
 242         and calls self.index_content() to index the contents of the book.
 243         """
 244         if overwrite:
 245             self.remove_book(book)
 246
 247         book_doc = self.create_book_doc(book)
 248         meta_fields = self.extract_metadata(book, book_info)
 249         for f in meta_fields.values():
 250             if isinstance(f, list) or isinstance(f, tuple):
 251                 for elem in f:
 252                     book_doc.add(elem)
 253             else:
 254                 book_doc.add(f)
 255
 256         self.index.addDocument(book_doc)
 257         del book_doc
 258
 259         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])
 260
 261     master_tags = [
 262         'opowiadanie',
 263         'powiesc',
 264         'dramat_wierszowany_l',
 265         'dramat_wierszowany_lp',
 266         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 267         'wywiad'
 268         ]
 269
 270     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
 271
 272     def extract_metadata(self, book, book_info=None):
 273         """
 274         Extract metadata from book and returns a map of fields keyed by fieldname
 275         """
 276         fields = {}
 277
 278         if book_info is None:
 279             book_info = dcparser.parse(open(book.xml_file.path))
 280
 281         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
 282         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
 283         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
 284
 285         # validator, name
 286         for field in dcparser.BookInfo.FIELDS:
 287             if hasattr(book_info, field.name):
 288                 if not getattr(book_info, field.name):
 289                     continue
 290                 # since no type information is available, we use validator
 291                 type_indicator = field.validator
 292                 if type_indicator == dcparser.as_unicode:
 293                     s = getattr(book_info, field.name)
 294                     if field.multiple:
 295                         s = ', '.join(s)
 296                     try:
 297                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
 298                     except JavaError as je:
 299                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
 300                 elif type_indicator == dcparser.as_person:
 301                     p = getattr(book_info, field.name)
 302                     if isinstance(p, dcparser.Person):
 303                         persons = unicode(p)
 304                     else:
 305                         persons = ', '.join(map(unicode, p))
 306                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
 307                 elif type_indicator == dcparser.as_date:
 308                     dt = getattr(book_info, field.name)
 309                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
 310                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 311
 312         return fields
 313
 314     def add_gaps(self, fields, fieldname):
 315         """
 316         Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 317         This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 318         """
 319         def gap():
 320             while True:
 321                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 322         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 323
 324     def get_master(self, root):
 325         """
 326         Returns the first master tag from an etree.
 327         """
 328         for master in root.iter():
 329             if master.tag in self.master_tags:
 330                 return master
 331
 332     def index_content(self, book, book_fields=[]):
 333         """
 334         Walks the book XML and extract content from it.
 335         Adds parts for each header tag and for each fragment.
 336         """
 337         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 338         root = wld.edoc.getroot()
 339
 340         master = self.get_master(root)
 341         if master is None:
 342             return []
 343
 344         def walker(node):
 345             yield node, None
 346             for child in list(node):
 347                 for b, e in walker(child):
 348                     yield b, e
 349             yield None, node
 350             return
 351
 352         def fix_format(text):
 353             return re.sub("(?m)/$", "", text)
 354
 355         def add_part(snippets, **fields):
 356             doc = self.create_book_doc(book)
 357             for f in book_fields:
 358                 doc.add(f)
 359
 360             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
 361             doc.add(NumericField("header_span", Field.Store.YES, True)\
 362                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
 363             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
 364
 365             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
 366                           Field.TermVector.WITH_POSITIONS_OFFSETS))
 367
 368             snip_pos = snippets.add(fields["content"])
 369             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
 370             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
 371
 372             if 'fragment_anchor' in fields:
 373                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
 374                               Field.Store.YES, Field.Index.NOT_ANALYZED))
 375
 376             if 'themes' in fields:
 377                 themes, themes_pl = zip(*[
 378                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
 379                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
 380                      for theme in fields['themes']])
 381
 382                 themes = self.add_gaps(themes, 'themes')
 383                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
 384
 385                 for t in themes:
 386                     doc.add(t)
 387                 for t in themes_pl:
 388                     doc.add(t)
 389
 390             return doc
 391
 392         def give_me_utf8(s):
 393             if isinstance(s, unicode):
 394                 return s.encode('utf-8')
 395             else:
 396                 return s
 397
 398         fragments = {}
 399         snippets = Snippets(book.id).open('w')
 400         try:
 401             for header, position in zip(list(master), range(len(master))):
 402
 403                 if header.tag in self.skip_header_tags:
 404                     continue
 405
 406                 # section content
 407                 content = []
 408
 409                 for start, end in walker(header):
 410                         # handle fragments and themes.
 411                     if start is not None and start.tag == 'begin':
 412                         fid = start.attrib['id'][1:]
 413                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 414
 415                     elif start is not None and start.tag == 'motyw':
 416                         fid = start.attrib['id'][1:]
 417                         if start.text is not None:
 418                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
 419
 420                     elif start is not None and start.tag == 'end':
 421                         fid = start.attrib['id'][1:]
 422                         if fid not in fragments:
 423                             continue  # a broken <end> node, skip it
 424                                       #                        import pdb; pdb.set_trace()
 425                         frag = fragments[fid]
 426                         if frag['themes'] == []:
 427                             continue  # empty themes list.
 428                         del fragments[fid]
 429
 430                         def jstr(l):
 431                             return u' '.join(map(
 432                                 lambda x: x == None and u'(none)' or unicode(x),
 433                                 l))
 434
 435                         doc = add_part(snippets,
 436                                        header_type=frag['start_header'],
 437                                        header_index=frag['start_section'],
 438                                        header_span=position - frag['start_section'] + 1,
 439                                        fragment_anchor=fid,
 440                                        content=u' '.join(filter(lambda s: s is not None, frag['content'])),
 441                                        themes=frag['themes'])
 442
 443                         self.index.addDocument(doc)
 444
 445                         # Collect content.
 446                     elif start is not None:
 447                         for frag in fragments.values():
 448                             frag['content'].append(start.text)
 449                         content.append(start.text)
 450                     elif end is not None:
 451                         for frag in fragments.values():
 452                             frag['content'].append(end.tail)
 453                         content.append(end.tail)
 454
 455                         # in the end, add a section text.
 456                 doc = add_part(snippets, header_index=position, header_type=header.tag,
 457                                content=fix_format(u' '.join(filter(lambda s: s is not None, frag['content']))))
 458
 459                 self.index.addDocument(doc)
 460
 461         finally:
 462             snippets.close()
 463
 464
 465 def log_exception_wrapper(f):
 466     def _wrap(*a):
 467         try:
 468             f(*a)
 469         except Exception, e:
 470             print("Error in indexing thread: %s" % e)
 471             traceback.print_exc()
 472             raise e
 473     return _wrap
 474
 475
 476 class ReusableIndex(Index):
 477     """
 478     Works like index, but does not close/optimize Lucene index
 479     until program exit (uses atexit hook).
 480     This is usefull for importbooks command.
 481
 482     if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
 483     """
 484     index = None
 485
 486     def open(self, analyzer=None, threads=4):
 487         if ReusableIndex.index is not None:
 488             self.index = ReusableIndex.index
 489         else:
 490             print("opening index")
 491             Index.open(self, analyzer)
 492             ReusableIndex.index = self.index
 493             atexit.register(ReusableIndex.close_reusable)
 494
 495     # def index_book(self, *args, **kw):
 496     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
 497     #     ReusableIndex.pool_jobs.append(job)
 498
 499     @staticmethod
 500     def close_reusable():
 501         if ReusableIndex.index is not None:
 502             ReusableIndex.index.optimize()
 503             ReusableIndex.index.close()
 504             ReusableIndex.index = None
 505
 506     def close(self):
 507         pass
 508
 509
 510 class JoinSearch(object):
 511     """
 512     This mixin could be used to handle block join queries.
 513     (currently unused)
 514     """
 515     def __init__(self, *args, **kw):
 516         super(JoinSearch, self).__init__(*args, **kw)
 517
 518     def wrapjoins(self, query, fields=[]):
 519         """
 520         This functions modifies the query in a recursive way,
 521         so Term and Phrase Queries contained, which match
 522         provided fields are wrapped in a BlockJoinQuery,
 523         and so delegated to children documents.
 524         """
 525         if BooleanQuery.instance_(query):
 526             qs = BooleanQuery.cast_(query)
 527             for clause in qs:
 528                 clause = BooleanClause.cast_(clause)
 529                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
 530             return qs
 531         else:
 532             termset = HashSet()
 533             query.extractTerms(termset)
 534             for t in termset:
 535                 t = Term.cast_(t)
 536                 if t.field() not in fields:
 537                     return query
 538             return BlockJoinQuery(query, self.parent_filter,
 539                                   BlockJoinQuery.ScoreMode.Total)
 540
 541     def bsearch(self, query, max_results=50):
 542         q = self.query(query)
 543         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
 544
 545         tops = self.searcher.search(bjq, max_results)
 546         bks = []
 547         for found in tops.scoreDocs:
 548             doc = self.searcher.doc(found.doc)
 549             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 550         return (bks, tops.totalHits)
 551
 552
 553 class SearchResult(object):
 554     def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
 555         if score:
 556             self.score = score
 557         else:
 558             self.score = scoreDocs.score
 559
 560         self._hits = []
 561         self.hits = None  # processed hits
 562
 563         stored = searcher.doc(scoreDocs.doc)
 564         self.book_id = int(stored.get("book_id"))
 565
 566         header_type = stored.get("header_type")
 567         if not header_type:
 568             return
 569
 570         sec = (header_type, int(stored.get("header_index")))
 571         header_span = stored.get('header_span')
 572         header_span = header_span is not None and int(header_span) or 1
 573
 574         fragment = stored.get("fragment_anchor")
 575
 576         if snippets:
 577             snippets = snippets.replace("/\n", "\n")
 578         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
 579
 580         self._hits.append(hit)
 581
 582     def merge(self, other):
 583         if self.book_id != other.book_id:
 584             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 585         self._hits += other._hits
 586         if other.score > self.score:
 587             self.score = other.score
 588         return self
 589
 590     def get_book(self):
 591         return catalogue.models.Book.objects.get(id=self.book_id)
 592
 593     book = property(get_book)
 594
 595     def process_hits(self):
 596         POSITION = 0
 597         FRAGMENT = 1
 598         POSITION_INDEX = 1
 599         POSITION_SPAN = 2
 600         SCORE = 2
 601         OTHER = 3
 602
 603         # to sections and fragments
 604         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
 605         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
 606         sect = filter(lambda s: 0 == len(filter(
 607             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
 608             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
 609             frags)), sect)
 610
 611         hits = []
 612
 613         # remove duplicate fragments
 614         fragments = {}
 615         for f in frags:
 616             fid = f[FRAGMENT]
 617             if fid in fragments:
 618                 if fragments[fid][SCORE] >= f[SCORE]:
 619                     continue
 620             fragments[fid] = f
 621         frags = fragments.values()
 622
 623         # remove duplicate sections
 624         sections = {}
 625
 626         for s in sect:
 627             si = s[POSITION][POSITION_INDEX]
 628             # skip existing
 629             if si in sections:
 630                 if sections[si]['score'] >= s[SCORE]:
 631                     continue
 632
 633             m = {'score': s[SCORE],
 634                  'section_number': s[POSITION][POSITION_INDEX] + 1,
 635                  }
 636             m.update(s[OTHER])
 637             sections[si] = m
 638
 639         hits = sections.values()
 640
 641         for f in frags:
 642             frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
 643             m = {'score': f[SCORE],
 644                  'fragment': frag,
 645                  'section_number': f[POSITION][POSITION_INDEX] + 1,
 646                  'themes': frag.tags.filter(category='theme')
 647                  }
 648             m.update(f[OTHER])
 649             hits.append(m)
 650
 651         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 652
 653         self.hits = hits
 654
 655         return self
 656
 657     def __unicode__(self):
 658         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
 659
 660     @staticmethod
 661     def aggregate(*result_lists):
 662         books = {}
 663         for rl in result_lists:
 664             for r in rl:
 665                 if r.book_id in books:
 666                     books[r.book_id].merge(r)
 667                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
 668                 else:
 669                     books[r.book_id] = r
 670         return books.values()
 671
 672     def __cmp__(self, other):
 673         return cmp(self.score, other.score)
 674
 675
 676 class Hint(object):
 677     """
 678     Given some hint information (information we already know about)
 679     our search target - like author, title (specific book), epoch, genre, kind
 680     we can narrow down search using filters.
 681     """
 682     def __init__(self, search):
 683         """
 684         Accepts a Searcher instance.
 685         """
 686         self.search = search
 687         self.book_tags = {}
 688         self.part_tags = []
 689         self._books = []
 690
 691     def books(self, *books):
 692         """
 693         Give a hint that we search these books.
 694         """
 695         self._books = books
 696
 697     def tags(self, tags):
 698         """
 699         Give a hint that these Tag objects (a list of)
 700         is necessary.
 701         """
 702         for t in tags:
 703             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
 704                 lst = self.book_tags.get(t.category, [])
 705                 lst.append(t)
 706                 self.book_tags[t.category] = lst
 707             if t.category in ['theme', 'theme_pl']:
 708                 self.part_tags.append(t)
 709
 710     def tag_filter(self, tags, field='tags'):
 711         """
 712         Given a lsit of tags and an optional field (but they are normally in tags field)
 713         returns a filter accepting only books with specific tags.
 714         """
 715         q = BooleanQuery()
 716
 717         for tag in tags:
 718             toks = self.search.get_tokens(tag.name, field=field)
 719             tag_phrase = PhraseQuery()
 720             for tok in toks:
 721                 tag_phrase.add(Term(field, tok))
 722             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
 723
 724         return QueryWrapperFilter(q)
 725
 726     def book_filter(self):
 727         """
 728         Filters using book tags (all tag kinds except a theme)
 729         """
 730         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
 731         if tags:
 732             return self.tag_filter(tags)
 733         else:
 734             return None
 735
 736     def part_filter(self):
 737         """
 738         This filter can be used to look for book parts.
 739         It filters on book id and/or themes.
 740         """
 741         fs = []
 742         if self.part_tags:
 743             fs.append(self.tag_filter(self.part_tags, field='themes'))
 744
 745         if self._books != []:
 746             bf = BooleanFilter()
 747             for b in self._books:
 748                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
 749                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
 750             fs.append(bf)
 751
 752         return Search.chain_filters(fs)
 753
 754     def should_search_for_book(self):
 755         return self._books == []
 756
 757     def just_search_in(self, all):
 758         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
 759         some = []
 760         for field in all:
 761             if field == 'authors' and 'author' in self.book_tags:
 762                 continue
 763             if field == 'title' and self._books != []:
 764                 continue
 765             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
 766                 continue
 767             some.append(field)
 768         return some
 769
 770
 771 class Search(IndexStore):
 772     """
 773     Search facilities.
 774     """
 775     def __init__(self, default_field="content"):
 776         IndexStore.__init__(self)
 777         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
 778         # self.analyzer = WLAnalyzer()
 779         self.searcher = IndexSearcher(self.store, True)
 780         self.parser = QueryParser(Version.LUCENE_34, default_field,
 781                                   self.analyzer)
 782
 783         self.parent_filter = TermsFilter()
 784         self.parent_filter.addTerm(Term("is_book", "true"))
 785
 786     def query(self, query):
 787         """Parse query in default Lucene Syntax. (for humans)
 788         """
 789         return self.parser.parse(query)
 790
 791     def simple_search(self, query, max_results=50):
 792         """Runs a query for books using lucene syntax. (for humans)
 793         Returns (books, total_hits)
 794         """
 795
 796         tops = self.searcher.search(self.query(query), max_results)
 797         bks = []
 798         for found in tops.scoreDocs:
 799             doc = self.searcher.doc(found.doc)
 800             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 801         return (bks, tops.totalHits)
 802
 803     def get_tokens(self, searched, field='content'):
 804         """returns tokens analyzed by a proper (for a field) analyzer
 805         argument can be: StringReader, string/unicode, or tokens. In the last case
 806         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
 807         """
 808         if isinstance(searched, str) or isinstance(searched, unicode):
 809             searched = StringReader(searched)
 810         elif isinstance(searched, list):
 811             return searched
 812
 813         searched.reset()
 814         tokens = self.analyzer.reusableTokenStream(field, searched)
 815         toks = []
 816         while tokens.incrementToken():
 817             cta = tokens.getAttribute(CharTermAttribute.class_)
 818             toks.append(cta.toString())
 819         return toks
 820
 821     def fuzziness(self, fuzzy):
 822         """Helper method to sanitize fuzziness"""
 823         if not fuzzy:
 824             return None
 825         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
 826             return fuzzy
 827         else:
 828             return 0.5
 829
 830     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
 831         """
 832         Return a PhraseQuery with a series of tokens.
 833         """
 834         if fuzzy:
 835             phrase = MultiPhraseQuery()
 836             for t in tokens:
 837                 term = Term(field, t)
 838                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
 839                 fuzzterms = []
 840
 841                 while True:
 842                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
 843                     ft = fuzzterm.term()
 844                     if ft:
 845                         fuzzterms.append(ft)
 846                     if not fuzzterm.next(): break
 847                 if fuzzterms:
 848                     phrase.add(JArray('object')(fuzzterms, Term))
 849                 else:
 850                     phrase.add(term)
 851         else:
 852             phrase = PhraseQuery()
 853             phrase.setSlop(slop)
 854             for t in tokens:
 855                 term = Term(field, t)
 856                 phrase.add(term)
 857         return phrase
 858
 859     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
 860         """
 861         Returns term queries joined by boolean query.
 862         modal - applies to boolean query
 863         fuzzy - should the query by fuzzy.
 864         """
 865         q = BooleanQuery()
 866         for t in tokens:
 867             term = Term(field, t)
 868             if fuzzy:
 869                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
 870             else:
 871                 term = TermQuery(term)
 872             q.add(BooleanClause(term, modal))
 873         return q
 874
 875     # def content_query(self, query):
 876     #     return BlockJoinQuery(query, self.parent_filter,
 877     #                           BlockJoinQuery.ScoreMode.Total)
 878
 879     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
 880         """
 881         Search for perfect book matches. Just see if the query matches with some author or title,
 882         taking hints into account.
 883         """
 884         fields_to_search = ['authors', 'title']
 885         only_in = None
 886         if hint:
 887             if not hint.should_search_for_book():
 888                 return []
 889             fields_to_search = hint.just_search_in(fields_to_search)
 890             only_in = hint.book_filter()
 891
 892         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
 893
 894         books = []
 895         for q in qrys:
 896             top = self.searcher.search(q,
 897                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
 898                 max_results)
 899             for found in top.scoreDocs:
 900                 books.append(SearchResult(self.searcher, found, how_found="search_perfect_book"))
 901         return books
 902
 903     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
 904         fields_to_search = ['tags', 'authors', 'title']
 905
 906         only_in = None
 907         if hint:
 908             if not hint.should_search_for_book():
 909                 return []
 910             fields_to_search = hint.just_search_in(fields_to_search)
 911             only_in = hint.book_filter()
 912
 913         tokens = self.get_tokens(searched, field='SIMPLE')
 914
 915         q = BooleanQuery()
 916
 917         for fld in fields_to_search:
 918             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
 919                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 920
 921         books = []
 922         top = self.searcher.search(q,
 923                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
 924             max_results)
 925         for found in top.scoreDocs:
 926             books.append(SearchResult(self.searcher, found, how_found="search_book"))
 927
 928         return books
 929
 930     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
 931         """
 932         Search for book parts which containt a phrase perfectly matching (with a slop of 2, default for make_phrase())
 933         some part/fragment of the book.
 934         """
 935         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
 936
 937         flt = None
 938         if hint:
 939             flt = hint.part_filter()
 940
 941         books = []
 942         for q in qrys:
 943             top = self.searcher.search(q,
 944                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
 945                                                            flt]),
 946                                        max_results)
 947             for found in top.scoreDocs:
 948                 books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
 949
 950         return books
 951
 952     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
 953         """
 954         Tries to use search terms to match different fields of book (or its parts).
 955         E.g. one word can be an author survey, another be a part of the title, and the rest
 956         are some words from third chapter.
 957         """
 958         books = []
 959         only_in = None
 960
 961         if hint:
 962             only_in = hint.part_filter()
 963
 964         # content only query : themes x content
 965         q = BooleanQuery()
 966
 967         tokens_pl = self.get_tokens(searched, field='content')
 968         tokens = self.get_tokens(searched, field='SIMPLE')
 969
 970         # only search in themes when we do not already filter by themes
 971         if hint is None or hint.just_search_in(['themes']) != []:
 972             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
 973                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
 974
 975         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
 976                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 977
 978         topDocs = self.searcher.search(q, only_in, max_results)
 979         for found in topDocs.scoreDocs:
 980             books.append(SearchResult(self.searcher, found, how_found='search_everywhere_themesXcontent'))
 981             print "* %s theme x content: %s" % (searched, books[-1]._hits)
 982
 983         # query themes/content x author/title/tags
 984         q = BooleanQuery()
 985         in_content = BooleanQuery()
 986         in_meta = BooleanQuery()
 987
 988         for fld in ['themes_pl', 'content']:
 989             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
 990
 991         for fld in ['tags', 'authors', 'title']:
 992             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
 993
 994         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
 995         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
 996
 997         topDocs = self.searcher.search(q, only_in, max_results)
 998         for found in topDocs.scoreDocs:
 999             books.append(SearchResult(self.searcher, found, how_found='search_everywhere'))
1000             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1001
1002         return books
1003
1004     # def multisearch(self, query, max_results=50):
1005     #     """
1006     #     Search strategy:
1007     #     - (phrase) OR -> content
1008     #                   -> title
1009     #                   -> authors
1010     #     - (keywords)  -> authors
1011     #                   -> motyw
1012     #                   -> tags
1013     #                   -> content
1014     #     """
1015         # queryreader = StringReader(query)
1016         # tokens = self.get_tokens(queryreader)
1017
1018         # top_level = BooleanQuery()
1019         # Should = BooleanClause.Occur.SHOULD
1020
1021         # phrase_level = BooleanQuery()
1022         # phrase_level.setBoost(1.3)
1023
1024         # p_content = self.make_phrase(tokens, joined=True)
1025         # p_title = self.make_phrase(tokens, 'title')
1026         # p_author = self.make_phrase(tokens, 'author')
1027
1028         # phrase_level.add(BooleanClause(p_content, Should))
1029         # phrase_level.add(BooleanClause(p_title, Should))
1030         # phrase_level.add(BooleanClause(p_author, Should))
1031
1032         # kw_level = BooleanQuery()
1033
1034         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1035         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1036         # kw_level.add(j_themes, Should)
1037         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1038         # j_con = self.make_term_query(tokens, joined=True)
1039         # kw_level.add(j_con, Should)
1040
1041         # top_level.add(BooleanClause(phrase_level, Should))
1042         # top_level.add(BooleanClause(kw_level, Should))
1043
1044         # return None
1045
1046     def get_snippets(self, scoreDoc, query, field='content'):
1047         """
1048         Returns a snippet for found scoreDoc.
1049         """
1050         htmlFormatter = SimpleHTMLFormatter()
1051         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1052
1053         stored = self.searcher.doc(scoreDoc.doc)
1054
1055         # locate content.
1056         snippets = Snippets(stored.get('book_id')).open()
1057         try:
1058             text = snippets.get((int(stored.get('snippets_position')),
1059                                  int(stored.get('snippets_length'))))
1060         finally:
1061             snippets.close()
1062
1063         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1064         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1065         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1066
1067         return snip
1068
1069     @staticmethod
1070     def enum_to_array(enum):
1071         """
1072         Converts a lucene TermEnum to array of Terms, suitable for
1073         addition to queries
1074         """
1075         terms = []
1076
1077         while True:
1078             t = enum.term()
1079             if t:
1080                 terms.append(t)
1081             if not enum.next(): break
1082
1083         if terms:
1084             return JArray('object')(terms, Term)
1085
1086     def search_tags(self, query, filter=None, max_results=40):
1087         """
1088         Search for Tag objects using query.
1089         """
1090         tops = self.searcher.search(query, filter, max_results)
1091
1092         tags = []
1093         for found in tops.scoreDocs:
1094             doc = self.searcher.doc(found.doc)
1095             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1096             tags.append(tag)
1097             print "%s (%d) -> %f" % (tag, tag.id, found.score)
1098
1099         return tags
1100
1101     def search_books(self, query, filter=None, max_results=10):
1102         """
1103         Searches for Book objects using query
1104         """
1105         bks = []
1106         tops = self.searcher.search(query, filter, max_results)
1107         for found in tops.scoreDocs:
1108             doc = self.searcher.doc(found.doc)
1109             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1110         return bks
1111
1112     def create_prefix_phrase(self, toks, field):
1113         q = MultiPhraseQuery()
1114         for i in range(len(toks)):
1115             t = Term(field, toks[i])
1116             if i == len(toks) - 1:
1117                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1118                 if pterms:
1119                     q.add(pterms)
1120                 else:
1121                     q.add(t)
1122             else:
1123                 q.add(t)
1124         return q
1125
1126     @staticmethod
1127     def term_filter(term, inverse=False):
1128         only_term = TermsFilter()
1129         only_term.addTerm(term)
1130
1131         if inverse:
1132             neg = BooleanFilter()
1133             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1134             only_term = neg
1135
1136         return only_term
1137
1138     def hint_tags(self, string, max_results=50):
1139         """
1140         Return auto-complete hints for tags
1141         using prefix search.
1142         """
1143         toks = self.get_tokens(string, field='SIMPLE')
1144         top = BooleanQuery()
1145
1146         for field in ['tag_name', 'tag_name_pl']:
1147             q = self.create_prefix_phrase(toks, field)
1148             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1149
1150         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1151
1152         return self.search_tags(top, no_book_cat, max_results=max_results)
1153
1154     def hint_books(self, string, max_results=50):
1155         """
1156         Returns auto-complete hints for book titles
1157         Because we do not index 'pseudo' title-tags.
1158         Prefix search.
1159         """
1160         toks = self.get_tokens(string, field='SIMPLE')
1161
1162         q = self.create_prefix_phrase(toks, 'title')
1163
1164         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1165
1166     @staticmethod
1167     def chain_filters(filters, op=ChainedFilter.AND):
1168         """
1169         Chains a filter list together
1170         """
1171         filters = filter(lambda x: x is not None, filters)
1172         if not filters:
1173             return None
1174         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1175         return chf
1176
1177     def filtered_categories(self, tags):
1178         """
1179         Return a list of tag categories, present in tags list.
1180         """
1181         cats = {}
1182         for t in tags:
1183             cats[t.category] = True
1184         return cats.keys()
1185
1186     def hint(self):
1187         return Hint(self)