apps/search/index.py

   1 # -*- coding: utf-8 -*-
   2
   3 from django.conf import settings
   4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
   5     File, Field, Integer, \
   6     NumericField, Version, Document, JavaError, IndexSearcher, \
   7     QueryParser, PerFieldAnalyzerWrapper, \
   8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
   9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
  10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
  11     HashSet, BooleanClause, Term, CharTermAttribute, \
  12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
  13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
  14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
  15     BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
  16     initVM, CLASSPATH, JArray, JavaError
  17     # KeywordAnalyzer
  18
# Start the Java VM for PyLucene at import time; the value returned by
# initVM() is kept in the module-level name JVM.
JVM = initVM(CLASSPATH)
  21
  22 import sys
  23 import os
  24 import re
  25 import errno
  26 from librarian import dcparser
  27 from librarian.parser import WLDocument
  28 import catalogue.models
  29 from multiprocessing.pool import ThreadPool
  30 from threading import current_thread
  31 import atexit
  32 import traceback
  33
  34
  35 class WLAnalyzer(PerFieldAnalyzerWrapper):
  36     def __init__(self):
  37         polish = PolishAnalyzer(Version.LUCENE_34)
  38         #        polish_gap.setPositionIncrementGap(999)
  39
  40         simple = SimpleAnalyzer(Version.LUCENE_34)
  41         #        simple_gap.setPositionIncrementGap(999)
  42
  43         keyword = KeywordAnalyzer(Version.LUCENE_34)
  44
  45         # not sure if needed: there's NOT_ANALYZED meaning basically the same
  46
  47         PerFieldAnalyzerWrapper.__init__(self, polish)
  48
  49         self.addAnalyzer("tags", simple)
  50         self.addAnalyzer("technical_editors", simple)
  51         self.addAnalyzer("editors", simple)
  52         self.addAnalyzer("url", keyword)
  53         self.addAnalyzer("source_url", keyword)
  54         self.addAnalyzer("source_name", simple)
  55         self.addAnalyzer("publisher", simple)
  56         self.addAnalyzer("authors", simple)
  57         self.addAnalyzer("title", simple)
  58
  59         self.addAnalyzer("is_book", keyword)
  60         # shouldn't the title have two forms? _pl and simple?
  61
  62         self.addAnalyzer("themes", simple)
  63         self.addAnalyzer("themes_pl", polish)
  64
  65         self.addAnalyzer("tag_name", simple)
  66         self.addAnalyzer("tag_name_pl", polish)
  67
  68         self.addAnalyzer("translators", simple)
  69
  70         self.addAnalyzer("KEYWORD", keyword)
  71         self.addAnalyzer("SIMPLE", simple)
  72         self.addAnalyzer("POLISH", polish)
  73
  74
  75 class IndexStore(object):
  76     """
  77     Provides access to search index.
  78
  79     self.store - lucene index directory
  80     """
  81     def __init__(self):
  82         self.make_index_dir()
  83         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
  84
  85     def make_index_dir(self):
  86         try:
  87             os.makedirs(settings.SEARCH_INDEX)
  88         except OSError as exc:
  89             if exc.errno == errno.EEXIST:
  90                 pass
  91             else: raise
  92
  93
  94 class IndexChecker(IndexStore):
  95     def __init__(self):
  96         IndexStore.__init__(self)
  97
  98     def check(self):
  99         checker = CheckIndex(self.store)
 100         status = checker.checkIndex()
 101         return status
 102
 103
 104 class Snippets(object):
 105     """
 106     This class manages snippet files for indexed object (book)
 107     the snippets are concatenated together, and their positions and
 108     lengths are kept in lucene index fields.
 109     """
 110     SNIPPET_DIR = "snippets"
 111
 112     def __init__(self, book_id):
 113         try:
 114             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
 115         except OSError as exc:
 116             if exc.errno == errno.EEXIST:
 117                 pass
 118             else: raise
 119         self.book_id = book_id
 120         self.file = None
 121
 122     def open(self, mode='r'):
 123         """
 124         Open the snippet file. Call .close() afterwards.
 125         """
 126         if not 'b' in mode:
 127             mode += 'b'
 128         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
 129         self.position = 0
 130         return self
 131
 132     def add(self, snippet):
 133         """
 134         Append a snippet (unicode) to the snippet file.
 135         Return a (position, length) tuple
 136         """
 137         txt = snippet.encode('utf-8')
 138         l = len(txt)
 139         self.file.write(txt)
 140         pos = (self.position, l)
 141         self.position += l
 142         return pos
 143
 144     def get(self, pos):
 145         """
 146         Given a tuple of (position, length) return an unicode
 147         of the snippet stored there.
 148         """
 149         self.file.seek(pos[0], 0)
 150         txt = self.file.read(pos[1]).decode('utf-8')
 151         return txt
 152
 153     def close(self):
 154         """Close snippet file"""
 155         self.file.close()
 156
 157
 158 class BaseIndex(IndexStore):
 159     """
 160     Base index class.
 161     Provides basic operations on index: opening, closing, optimizing.
 162     """
 163     def __init__(self, analyzer=None):
 164         super(BaseIndex, self).__init__()
 165         self.index = None
 166         if not analyzer:
 167             analyzer = WLAnalyzer()
 168         self.analyzer = analyzer
 169
 170     def open(self, analyzer=None):
 171         if self.index:
 172             raise Exception("Index is already opened")
 173         self.index = IndexWriter(self.store, self.analyzer,\
 174                                  IndexWriter.MaxFieldLength.LIMITED)
 175         return self.index
 176
 177     def optimize(self):
 178         self.index.optimize()
 179
 180     def close(self):
 181         try:
 182             self.index.optimize()
 183         except JavaError, je:
 184             print "Error during optimize phase, check index: %s" % je
 185
 186         self.index.close()
 187         self.index = None
 188
 189     def __enter__(self):
 190         self.open()
 191         return self
 192
 193     def __exit__(self, type, value, tb):
 194         self.close()
 195
 196
 197 class Index(BaseIndex):
 198     """
 199     Class indexing books.
 200     """
 201     def __init__(self, analyzer=None):
 202         super(Index, self).__init__(analyzer)
 203
 204     def index_tags(self):
 205         """
 206         Re-index global tag list.
 207         Removes all tags from index, then index them again.
 208         Indexed fields include: id, name (with and without polish stems), category
 209         """
 210         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
 211         self.index.deleteDocuments(q)
 212
 213         for tag in catalogue.models.Tag.objects.all():
 214             doc = Document()
 215             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
 216             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 217             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 218             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
 219             self.index.addDocument(doc)
 220
 221     def create_book_doc(self, book):
 222         """
 223         Create a lucene document referring book id.
 224         """
 225         doc = Document()
 226         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
 227         if book.parent is not None:
 228             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
 229         return doc
 230
 231     def remove_book(self, book):
 232         """Removes a book from search index.
 233         book - Book instance."""
 234         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
 235         self.index.deleteDocuments(q)
 236
 237     def index_book(self, book, book_info=None, overwrite=True):
 238         """
 239         Indexes the book.
 240         Creates a lucene document for extracted metadata
 241         and calls self.index_content() to index the contents of the book.
 242         """
 243         if overwrite:
 244             self.remove_book(book)
 245
 246         book_doc = self.create_book_doc(book)
 247         meta_fields = self.extract_metadata(book, book_info)
 248         for f in meta_fields.values():
 249             if isinstance(f, list) or isinstance(f, tuple):
 250                 for elem in f:
 251                     book_doc.add(elem)
 252             else:
 253                 book_doc.add(f)
 254
 255         self.index.addDocument(book_doc)
 256         del book_doc
 257
 258         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])
 259
 260     master_tags = [
 261         'opowiadanie',
 262         'powiesc',
 263         'dramat_wierszowany_l',
 264         'dramat_wierszowany_lp',
 265         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 266         'wywiad'
 267         ]
 268
 269     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
 270
 271     def extract_metadata(self, book, book_info=None):
 272         """
 273         Extract metadata from book and returns a map of fields keyed by fieldname
 274         """
 275         fields = {}
 276
 277         if book_info is None:
 278             book_info = dcparser.parse(open(book.xml_file.path))
 279
 280         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
 281         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
 282         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
 283
 284         # validator, name
 285         for field in dcparser.BookInfo.FIELDS:
 286             if hasattr(book_info, field.name):
 287                 if not getattr(book_info, field.name):
 288                     continue
 289                 # since no type information is available, we use validator
 290                 type_indicator = field.validator
 291                 if type_indicator == dcparser.as_unicode:
 292                     s = getattr(book_info, field.name)
 293                     if field.multiple:
 294                         s = ', '.join(s)
 295                     try:
 296                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
 297                     except JavaError as je:
 298                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
 299                 elif type_indicator == dcparser.as_person:
 300                     p = getattr(book_info, field.name)
 301                     if isinstance(p, dcparser.Person):
 302                         persons = unicode(p)
 303                     else:
 304                         persons = ', '.join(map(unicode, p))
 305                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
 306                 elif type_indicator == dcparser.as_date:
 307                     dt = getattr(book_info, field.name)
 308                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
 309                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 310
 311         return fields
 312
 313     def add_gaps(self, fields, fieldname):
 314         """
 315         Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 316         This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 317         """
 318         def gap():
 319             while True:
 320                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 321         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 322
 323     def get_master(self, root):
 324         """
 325         Returns the first master tag from an etree.
 326         """
 327         for master in root.iter():
 328             if master.tag in self.master_tags:
 329                 return master
 330
 331     def index_content(self, book, book_fields=[]):
 332         """
 333         Walks the book XML and extract content from it.
 334         Adds parts for each header tag and for each fragment.
 335         """
 336         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 337         root = wld.edoc.getroot()
 338
 339         master = self.get_master(root)
 340         if master is None:
 341             return []
 342
 343         def walker(node):
 344             yield node, None
 345             for child in list(node):
 346                 for b, e in walker(child):
 347                     yield b, e
 348             yield None, node
 349             return
 350
 351         def fix_format(text):
 352             return re.sub("(?m)/$", "", text)
 353
 354         def add_part(snippets, **fields):
 355             doc = self.create_book_doc(book)
 356             for f in book_fields:
 357                 doc.add(f)
 358
 359             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
 360             doc.add(NumericField("header_span", Field.Store.YES, True)\
 361                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
 362             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
 363
 364             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
 365                           Field.TermVector.WITH_POSITIONS_OFFSETS))
 366
 367             snip_pos = snippets.add(fields["content"])
 368             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
 369             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
 370
 371             if 'fragment_anchor' in fields:
 372                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
 373                               Field.Store.YES, Field.Index.NOT_ANALYZED))
 374
 375             if 'themes' in fields:
 376                 themes, themes_pl = zip(*[
 377                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
 378                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
 379                      for theme in fields['themes']])
 380
 381                 themes = self.add_gaps(themes, 'themes')
 382                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
 383
 384                 for t in themes:
 385                     doc.add(t)
 386                 for t in themes_pl:
 387                     doc.add(t)
 388
 389             return doc
 390
 391         def give_me_utf8(s):
 392             if isinstance(s, unicode):
 393                 return s.encode('utf-8')
 394             else:
 395                 return s
 396
 397         fragments = {}
 398         snippets = Snippets(book.id).open('w')
 399         try:
 400             for header, position in zip(list(master), range(len(master))):
 401
 402                 if header.tag in self.skip_header_tags:
 403                     continue
 404
 405                 content = u' '.join([t for t in header.itertext()])
 406                 content = fix_format(content)
 407
 408                 doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)
 409
 410                 self.index.addDocument(doc)
 411
 412                 for start, end in walker(header):
 413                     if start is not None and start.tag == 'begin':
 414                         fid = start.attrib['id'][1:]
 415                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 416                         fragments[fid]['content'].append(start.tail)
 417                     elif start is not None and start.tag == 'motyw':
 418                         fid = start.attrib['id'][1:]
 419                         if start.text is not None:
 420                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
 421                         fragments[fid]['content'].append(start.tail)
 422                     elif start is not None and start.tag == 'end':
 423                         fid = start.attrib['id'][1:]
 424                         if fid not in fragments:
 425                             continue  # a broken <end> node, skip it
 426                         frag = fragments[fid]
 427                         if frag['themes'] == []:
 428                             continue  # empty themes list.
 429                         del fragments[fid]
 430
 431                         def jstr(l):
 432                             return u' '.join(map(
 433                                 lambda x: x == None and u'(none)' or unicode(x),
 434                                 l))
 435
 436                         doc = add_part(snippets,
 437                                        header_type=frag['start_header'],
 438                                        header_index=frag['start_section'],
 439                                        header_span=position - frag['start_section'] + 1,
 440                                        fragment_anchor=fid,
 441                                        content=u' '.join(filter(lambda s: s is not None, frag['content'])),
 442                                        themes=frag['themes'])
 443
 444                         self.index.addDocument(doc)
 445                     elif start is not None:
 446                         for frag in fragments.values():
 447                             frag['content'].append(start.text)
 448                     elif end is not None:
 449                         for frag in fragments.values():
 450                             frag['content'].append(end.tail)
 451         finally:
 452             snippets.close()
 453
 454
 455 def log_exception_wrapper(f):
 456     def _wrap(*a):
 457         try:
 458             f(*a)
 459         except Exception, e:
 460             print("Error in indexing thread: %s" % e)
 461             traceback.print_exc()
 462             raise e
 463     return _wrap
 464
 465
 466 class ReusableIndex(Index):
 467     """
 468     Works like index, but does not close/optimize Lucene index
 469     until program exit (uses atexit hook).
 470     This is usefull for importbooks command.
 471
 472     if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
 473     """
 474     index = None
 475
 476     def open(self, analyzer=None, threads=4):
 477         if ReusableIndex.index is not None:
 478             self.index = ReusableIndex.index
 479         else:
 480             print("opening index")
 481             Index.open(self, analyzer)
 482             ReusableIndex.index = self.index
 483             atexit.register(ReusableIndex.close_reusable)
 484
 485     # def index_book(self, *args, **kw):
 486     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
 487     #     ReusableIndex.pool_jobs.append(job)
 488
 489     @staticmethod
 490     def close_reusable():
 491         if ReusableIndex.index is not None:
 492             ReusableIndex.index.optimize()
 493             ReusableIndex.index.close()
 494             ReusableIndex.index = None
 495
 496     def close(self):
 497         pass
 498
 499
 500 class JoinSearch(object):
 501     """
 502     This mixin could be used to handle block join queries.
 503     (currently unused)
 504     """
 505     def __init__(self, *args, **kw):
 506         super(JoinSearch, self).__init__(*args, **kw)
 507
 508     def wrapjoins(self, query, fields=[]):
 509         """
 510         This functions modifies the query in a recursive way,
 511         so Term and Phrase Queries contained, which match
 512         provided fields are wrapped in a BlockJoinQuery,
 513         and so delegated to children documents.
 514         """
 515         if BooleanQuery.instance_(query):
 516             qs = BooleanQuery.cast_(query)
 517             for clause in qs:
 518                 clause = BooleanClause.cast_(clause)
 519                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
 520             return qs
 521         else:
 522             termset = HashSet()
 523             query.extractTerms(termset)
 524             for t in termset:
 525                 t = Term.cast_(t)
 526                 if t.field() not in fields:
 527                     return query
 528             return BlockJoinQuery(query, self.parent_filter,
 529                                   BlockJoinQuery.ScoreMode.Total)
 530
 531     def bsearch(self, query, max_results=50):
 532         q = self.query(query)
 533         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
 534
 535         tops = self.searcher.search(bjq, max_results)
 536         bks = []
 537         for found in tops.scoreDocs:
 538             doc = self.searcher.doc(found.doc)
 539             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 540         return (bks, tops.totalHits)
 541
 542
 543 class SearchResult(object):
 544     def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
 545         if score:
 546             self.score = score
 547         else:
 548             self.score = scoreDocs.score
 549
 550         self._hits = []
 551         self.hits = None  # processed hits
 552
 553         stored = searcher.doc(scoreDocs.doc)
 554         self.book_id = int(stored.get("book_id"))
 555
 556         header_type = stored.get("header_type")
 557         if not header_type:
 558             return
 559
 560         sec = (header_type, int(stored.get("header_index")))
 561         header_span = stored.get('header_span')
 562         header_span = header_span is not None and int(header_span) or 1
 563
 564         fragment = stored.get("fragment_anchor")
 565
 566         if snippets:
 567             snippets = snippets.replace("/\n", "\n")
 568         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
 569
 570         self._hits.append(hit)
 571
 572     def merge(self, other):
 573         if self.book_id != other.book_id:
 574             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 575         self._hits += other._hits
 576         if other.score > self.score:
 577             self.score = other.score
 578         return self
 579
 580     def get_book(self):
 581         return catalogue.models.Book.objects.get(id=self.book_id)
 582
 583     book = property(get_book)
 584
 585     def process_hits(self):
 586         POSITION = 0
 587         FRAGMENT = 1
 588         POSITION_INDEX = 1
 589         POSITION_SPAN = 2
 590         SCORE = 2
 591         OTHER = 3
 592
 593         # to sections and fragments
 594         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
 595         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
 596         sect = filter(lambda s: 0 == len(filter(
 597             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
 598             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
 599             frags)), sect)
 600
 601         hits = []
 602
 603         # remove duplicate fragments
 604         fragments = {}
 605         for f in frags:
 606             fid = f[FRAGMENT]
 607             if fid in fragments:
 608                 if fragments[fid][SCORE] >= f[SCORE]:
 609                     continue
 610             fragments[fid] = f
 611         frags = fragments.values()
 612
 613         # remove duplicate sections
 614         sections = {}
 615
 616         for s in sect:
 617             si = s[POSITION][POSITION_INDEX]
 618             # skip existing
 619             if si in sections:
 620                 if sections[si]['score'] >= s[SCORE]:
 621                     continue
 622
 623             m = {'score': s[SCORE],
 624                  'section_number': s[POSITION][POSITION_INDEX] + 1,
 625                  }
 626             m.update(s[OTHER])
 627             sections[si] = m
 628
 629         hits = sections.values()
 630
 631         for f in frags:
 632             frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
 633             m = {'score': f[SCORE],
 634                  'fragment': frag,
 635                  'section_number': f[POSITION][POSITION_INDEX] + 1,
 636                  'themes': frag.tags.filter(category='theme')
 637                  }
 638             m.update(f[OTHER])
 639             hits.append(m)
 640
 641         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 642
 643         self.hits = hits
 644
 645         return self
 646
 647     def __unicode__(self):
 648         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
 649
 650     @staticmethod
 651     def aggregate(*result_lists):
 652         books = {}
 653         for rl in result_lists:
 654             for r in rl:
 655                 if r.book_id in books:
 656                     books[r.book_id].merge(r)
 657                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
 658                 else:
 659                     books[r.book_id] = r
 660         return books.values()
 661
 662     def __cmp__(self, other):
 663         return cmp(self.score, other.score)
 664
 665
 666 class Hint(object):
 667     """
 668     Given some hint information (information we already know about)
 669     our search target - like author, title (specific book), epoch, genre, kind
 670     we can narrow down search using filters.
 671     """
 672     def __init__(self, search):
 673         """
 674         Accepts a Searcher instance.
 675         """
 676         self.search = search
 677         self.book_tags = {}
 678         self.part_tags = []
 679         self._books = []
 680
 681     def books(self, *books):
 682         """
 683         Give a hint that we search these books.
 684         """
 685         self._books = books
 686
 687     def tags(self, tags):
 688         """
 689         Give a hint that these Tag objects (a list of)
 690         is necessary.
 691         """
 692         for t in tags:
 693             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
 694                 lst = self.book_tags.get(t.category, [])
 695                 lst.append(t)
 696                 self.book_tags[t.category] = lst
 697             if t.category in ['theme', 'theme_pl']:
 698                 self.part_tags.append(t)
 699
 700     def tag_filter(self, tags, field='tags'):
 701         """
 702         Given a lsit of tags and an optional field (but they are normally in tags field)
 703         returns a filter accepting only books with specific tags.
 704         """
 705         q = BooleanQuery()
 706
 707         for tag in tags:
 708             toks = self.search.get_tokens(tag.name, field=field)
 709             tag_phrase = PhraseQuery()
 710             for tok in toks:
 711                 tag_phrase.add(Term(field, tok))
 712             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
 713
 714         return QueryWrapperFilter(q)
 715
 716     def book_filter(self):
 717         """
 718         Filters using book tags (all tag kinds except a theme)
 719         """
 720         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
 721         if tags:
 722             return self.tag_filter(tags)
 723         else:
 724             return None
 725
 726     def part_filter(self):
 727         """
 728         This filter can be used to look for book parts.
 729         It filters on book id and/or themes.
 730         """
 731         fs = []
 732         if self.part_tags:
 733             fs.append(self.tag_filter(self.part_tags, field='themes'))
 734
 735         if self._books != []:
 736             bf = BooleanFilter()
 737             for b in self._books:
 738                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
 739                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
 740             fs.append(bf)
 741
 742         return Search.chain_filters(fs)
 743
 744     def should_search_for_book(self):
 745         return self._books == []
 746
 747     def just_search_in(self, all):
 748         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
 749         some = []
 750         for field in all:
 751             if field == 'authors' and 'author' in self.book_tags:
 752                 continue
 753             if field == 'title' and self._books != []:
 754                 continue
 755             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
 756                 continue
 757             some.append(field)
 758         return some
 759
 760
 761 class Search(IndexStore):
 762     """
 763     Search facilities.
 764     """
 765     def __init__(self, default_field="content"):
 766         IndexStore.__init__(self)
 767         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
 768         # self.analyzer = WLAnalyzer()
 769         self.searcher = IndexSearcher(self.store, True)
 770         self.parser = QueryParser(Version.LUCENE_34, default_field,
 771                                   self.analyzer)
 772
 773         self.parent_filter = TermsFilter()
 774         self.parent_filter.addTerm(Term("is_book", "true"))
 775
 776     def query(self, query):
 777         """Parse query in default Lucene Syntax. (for humans)
 778         """
 779         return self.parser.parse(query)
 780
 781     def simple_search(self, query, max_results=50):
 782         """Runs a query for books using lucene syntax. (for humans)
 783         Returns (books, total_hits)
 784         """
 785
 786         tops = self.searcher.search(self.query(query), max_results)
 787         bks = []
 788         for found in tops.scoreDocs:
 789             doc = self.searcher.doc(found.doc)
 790             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 791         return (bks, tops.totalHits)
 792
 793     def get_tokens(self, searched, field='content'):
 794         """returns tokens analyzed by a proper (for a field) analyzer
 795         argument can be: StringReader, string/unicode, or tokens. In the last case
 796         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
 797         """
 798         if isinstance(searched, str) or isinstance(searched, unicode):
 799             searched = StringReader(searched)
 800         elif isinstance(searched, list):
 801             return searched
 802
 803         searched.reset()
 804         tokens = self.analyzer.reusableTokenStream(field, searched)
 805         toks = []
 806         while tokens.incrementToken():
 807             cta = tokens.getAttribute(CharTermAttribute.class_)
 808             toks.append(cta.toString())
 809         return toks
 810
 811     def fuzziness(self, fuzzy):
 812         """Helper method to sanitize fuzziness"""
 813         if not fuzzy:
 814             return None
 815         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
 816             return fuzzy
 817         else:
 818             return 0.5
 819
 820     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
 821         """
 822         Return a PhraseQuery with a series of tokens.
 823         """
 824         if fuzzy:
 825             phrase = MultiPhraseQuery()
 826             for t in tokens:
 827                 term = Term(field, t)
 828                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
 829                 fuzzterms = []
 830
 831                 while True:
 832                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
 833                     ft = fuzzterm.term()
 834                     if ft:
 835                         fuzzterms.append(ft)
 836                     if not fuzzterm.next(): break
 837                 if fuzzterms:
 838                     phrase.add(JArray('object')(fuzzterms, Term))
 839                 else:
 840                     phrase.add(term)
 841         else:
 842             phrase = PhraseQuery()
 843             phrase.setSlop(slop)
 844             for t in tokens:
 845                 term = Term(field, t)
 846                 phrase.add(term)
 847         return phrase
 848
 849     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
 850         """
 851         Returns term queries joined by boolean query.
 852         modal - applies to boolean query
 853         fuzzy - should the query by fuzzy.
 854         """
 855         q = BooleanQuery()
 856         for t in tokens:
 857             term = Term(field, t)
 858             if fuzzy:
 859                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
 860             else:
 861                 term = TermQuery(term)
 862             q.add(BooleanClause(term, modal))
 863         return q
 864
 865     # def content_query(self, query):
 866     #     return BlockJoinQuery(query, self.parent_filter,
 867     #                           BlockJoinQuery.ScoreMode.Total)
 868
 869     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
 870         """
 871         Search for perfect book matches. Just see if the query matches with some author or title,
 872         taking hints into account.
 873         """
 874         fields_to_search = ['authors', 'title']
 875         only_in = None
 876         if hint:
 877             if not hint.should_search_for_book():
 878                 return []
 879             fields_to_search = hint.just_search_in(fields_to_search)
 880             only_in = hint.book_filter()
 881
 882         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
 883
 884         books = []
 885         for q in qrys:
 886             top = self.searcher.search(q,
 887                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
 888                 max_results)
 889             for found in top.scoreDocs:
 890                 books.append(SearchResult(self.searcher, found, how_found="search_perfect_book"))
 891         return books
 892
 893     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
 894         fields_to_search = ['tags', 'authors', 'title']
 895
 896         only_in = None
 897         if hint:
 898             if not hint.should_search_for_book():
 899                 return []
 900             fields_to_search = hint.just_search_in(fields_to_search)
 901             only_in = hint.book_filter()
 902
 903         tokens = self.get_tokens(searched, field='SIMPLE')
 904
 905         q = BooleanQuery()
 906
 907         for fld in fields_to_search:
 908             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
 909                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 910
 911         books = []
 912         top = self.searcher.search(q,
 913                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
 914             max_results)
 915         for found in top.scoreDocs:
 916             books.append(SearchResult(self.searcher, found, how_found="search_book"))
 917
 918         return books
 919
 920     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
 921         """
 922         Search for book parts which containt a phrase perfectly matching (with a slop of 2, default for make_phrase())
 923         some part/fragment of the book.
 924         """
 925         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
 926
 927         flt = None
 928         if hint:
 929             flt = hint.part_filter()
 930
 931         books = []
 932         for q in qrys:
 933             top = self.searcher.search(q,
 934                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
 935                                                            flt]),
 936                                        max_results)
 937             for found in top.scoreDocs:
 938                 books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
 939
 940         return books
 941
 942     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
 943         """
 944         Tries to use search terms to match different fields of book (or its parts).
 945         E.g. one word can be an author survey, another be a part of the title, and the rest
 946         are some words from third chapter.
 947         """
 948         books = []
 949         only_in = None
 950
 951         if hint:
 952             only_in = hint.part_filter()
 953
 954         # content only query : themes x content
 955         q = BooleanQuery()
 956
 957         tokens_pl = self.get_tokens(searched, field='content')
 958         tokens = self.get_tokens(searched, field='SIMPLE')
 959
 960         # only search in themes when we do not already filter by themes
 961         if hint is None or hint.just_search_in(['themes']) != []:
 962             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
 963                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
 964
 965         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
 966                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 967
 968         topDocs = self.searcher.search(q, only_in, max_results)
 969         for found in topDocs.scoreDocs:
 970             books.append(SearchResult(self.searcher, found, how_found='search_everywhere_themesXcontent'))
 971             print "* %s theme x content: %s" % (searched, books[-1]._hits)
 972
 973         # query themes/content x author/title/tags
 974         q = BooleanQuery()
 975         in_content = BooleanQuery()
 976         in_meta = BooleanQuery()
 977
 978         for fld in ['themes_pl', 'content']:
 979             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
 980
 981         for fld in ['tags', 'authors', 'title']:
 982             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
 983
 984         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
 985         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
 986
 987         topDocs = self.searcher.search(q, only_in, max_results)
 988         for found in topDocs.scoreDocs:
 989             books.append(SearchResult(self.searcher, found, how_found='search_everywhere'))
 990             print "* %s scatter search: %s" % (searched, books[-1]._hits)
 991
 992         return books
 993
 994     # def multisearch(self, query, max_results=50):
 995     #     """
 996     #     Search strategy:
 997     #     - (phrase) OR -> content
 998     #                   -> title
 999     #                   -> authors
1000     #     - (keywords)  -> authors
1001     #                   -> motyw
1002     #                   -> tags
1003     #                   -> content
1004     #     """
1005         # queryreader = StringReader(query)
1006         # tokens = self.get_tokens(queryreader)
1007
1008         # top_level = BooleanQuery()
1009         # Should = BooleanClause.Occur.SHOULD
1010
1011         # phrase_level = BooleanQuery()
1012         # phrase_level.setBoost(1.3)
1013
1014         # p_content = self.make_phrase(tokens, joined=True)
1015         # p_title = self.make_phrase(tokens, 'title')
1016         # p_author = self.make_phrase(tokens, 'author')
1017
1018         # phrase_level.add(BooleanClause(p_content, Should))
1019         # phrase_level.add(BooleanClause(p_title, Should))
1020         # phrase_level.add(BooleanClause(p_author, Should))
1021
1022         # kw_level = BooleanQuery()
1023
1024         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1025         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1026         # kw_level.add(j_themes, Should)
1027         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1028         # j_con = self.make_term_query(tokens, joined=True)
1029         # kw_level.add(j_con, Should)
1030
1031         # top_level.add(BooleanClause(phrase_level, Should))
1032         # top_level.add(BooleanClause(kw_level, Should))
1033
1034         # return None
1035
1036     def get_snippets(self, scoreDoc, query, field='content'):
1037         """
1038         Returns a snippet for found scoreDoc.
1039         """
1040         htmlFormatter = SimpleHTMLFormatter()
1041         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1042
1043         stored = self.searcher.doc(scoreDoc.doc)
1044
1045         # locate content.
1046         snippets = Snippets(stored.get('book_id')).open()
1047         try:
1048             text = snippets.get((int(stored.get('snippets_position')),
1049                                  int(stored.get('snippets_length'))))
1050         finally:
1051             snippets.close()
1052
1053         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1054         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1055         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1056
1057         return snip
1058
1059     @staticmethod
1060     def enum_to_array(enum):
1061         """
1062         Converts a lucene TermEnum to array of Terms, suitable for
1063         addition to queries
1064         """
1065         terms = []
1066
1067         while True:
1068             t = enum.term()
1069             if t:
1070                 terms.append(t)
1071             if not enum.next(): break
1072
1073         if terms:
1074             return JArray('object')(terms, Term)
1075
1076     def search_tags(self, query, filter=None, max_results=40):
1077         """
1078         Search for Tag objects using query.
1079         """
1080         tops = self.searcher.search(query, filter, max_results)
1081
1082         tags = []
1083         for found in tops.scoreDocs:
1084             doc = self.searcher.doc(found.doc)
1085             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1086             tags.append(tag)
1087             print "%s (%d) -> %f" % (tag, tag.id, found.score)
1088
1089         return tags
1090
1091     def search_books(self, query, filter=None, max_results=10):
1092         """
1093         Searches for Book objects using query
1094         """
1095         bks = []
1096         tops = self.searcher.search(query, filter, max_results)
1097         for found in tops.scoreDocs:
1098             doc = self.searcher.doc(found.doc)
1099             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1100         return bks
1101
1102     def create_prefix_phrase(self, toks, field):
1103         q = MultiPhraseQuery()
1104         for i in range(len(toks)):
1105             t = Term(field, toks[i])
1106             if i == len(toks) - 1:
1107                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1108                 if pterms:
1109                     q.add(pterms)
1110                 else:
1111                     q.add(t)
1112             else:
1113                 q.add(t)
1114         return q
1115
1116     @staticmethod
1117     def term_filter(term, inverse=False):
1118         only_term = TermsFilter()
1119         only_term.addTerm(term)
1120
1121         if inverse:
1122             neg = BooleanFilter()
1123             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1124             only_term = neg
1125
1126         return only_term
1127
1128     def hint_tags(self, string, max_results=50):
1129         """
1130         Return auto-complete hints for tags
1131         using prefix search.
1132         """
1133         toks = self.get_tokens(string, field='SIMPLE')
1134         top = BooleanQuery()
1135
1136         for field in ['tag_name', 'tag_name_pl']:
1137             q = self.create_prefix_phrase(toks, field)
1138             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1139
1140         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1141
1142         return self.search_tags(top, no_book_cat, max_results=max_results)
1143
1144     def hint_books(self, string, max_results=50):
1145         """
1146         Returns auto-complete hints for book titles
1147         Because we do not index 'pseudo' title-tags.
1148         Prefix search.
1149         """
1150         toks = self.get_tokens(string, field='SIMPLE')
1151
1152         q = self.create_prefix_phrase(toks, 'title')
1153
1154         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1155
1156     @staticmethod
1157     def chain_filters(filters, op=ChainedFilter.AND):
1158         """
1159         Chains a filter list together
1160         """
1161         filters = filter(lambda x: x is not None, filters)
1162         if not filters:
1163             return None
1164         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1165         return chf
1166
1167     def filtered_categories(self, tags):
1168         """
1169         Return a list of tag categories, present in tags list.
1170         """
1171         cats = {}
1172         for t in tags:
1173             cats[t.category] = True
1174         return cats.keys()
1175
1176     def hint(self):
1177         return Hint(self)