# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

JVM = initVM(CLASSPATH)

import atexit
import errno
import os
import re
import traceback

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same
        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
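
# A short sketch of what the per-field wrapper buys us (assumes the JVM is
# initialized, as above): the same text tokenizes differently depending on
# the target field.
#
#   analyzer = WLAnalyzer()
#   # "tags"/"authors"/... -> SimpleAnalyzer (lowercase, split on non-letters)
#   # "url"/"is_book"      -> KeywordAnalyzer (whole value as a single token)
#   # any other field      -> PolishAnalyzer (stemming), the default above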


class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()


class BaseIndex(IndexStore):
    """
    Provides basic operations on the index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if analyzer is None:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)
        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
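
# BaseIndex is a context manager, so a short indexing session can be written
# as (sketch; Index below subclasses BaseIndex):
#
#   with Index() as index:
#       index.index_tags()
#   # __exit__() calls close(), which optimizes and closes the IndexWriter.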


class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interpose a list of fields with gap-fields (indexed spaces) and return it.
        This allows phrase queries that do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
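
    # Sketch of the effect: add_gaps([f1, f2], 'tags') yields
    # [f1, <' ' gap>, f2] (the trailing gap is dropped by [0:-1]), so a
    # slop-0 phrase query cannot match across two different tag values.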

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        position = 0
        try:
            for header in list(master):
                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                content = []

                for start, end in walker(header):
                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        def jstr(l):
                            # debug helper: join, rendering None as u'(none)'
                            return u' '.join(map(
                                lambda x: x == None and u'(none)' or unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)

                    # collect content for open fragments and the section.
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                        content.append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
                        content.append(end.tail)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(u' '.join(filter(lambda s: s is not None, content))))

                self.index.addDocument(doc)
                position += 1
        finally:
            snippets.close()


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass
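
# Usage sketch for batch imports (the index stays open between books and is
# optimized/closed once, via the atexit hook or an explicit call):
#
#   index = ReusableIndex()
#   index.open()
#   for book in catalogue.models.Book.objects.all():
#       index.index_book(book)
#   # ReusableIndex.close_reusable() runs at program exit.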


class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query in a recursive way,
        so that Term and Phrase queries contained in it, which match
        the provided fields, are wrapped in a BlockJoinQuery
        and so delegated to children documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score
        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        # a content hit carries a header type; a book-level hit does not.
        if header_type:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

            if snippets:
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.searcher = searcher
        self.searched = searched
        self.tokens_cache = tokens_cache

    def get_score(self):
        return self._score * self.boost

    def set_score(self, score):
        self._score = score

    score = property(get_score, set_score)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # positions within the hit tuple
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        # drop sections already covered by a matching fragment
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing, better-scored entries
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            except catalogue.models.Fragment.DoesNotExist:
                # stale index entry, skip it
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.searcher.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.searcher.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit,
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits
        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)
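
# Sketch: aggregate() merges per-book results coming from different search
# strategies, so each book appears once, with hits combined and the best
# score kept:
#
#   results = SearchResult.aggregate(search.search_perfect_book(query),
#                                    search.search_book(query))
#   results.sort(reverse=True)   # __cmp__ above sorts by score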


class Hint(object):
    """
    Given some hint information (information we already know about
    our search target) - like author, title (specific book), epoch, genre,
    kind - we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Search instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of)
        are necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the tags field),
        returns a filter accepting only books with the specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched, given the hints we already have."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
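
# Usage sketch (assumes a Search instance and catalogue Tag objects):
#
#   hint = Hint(search)
#   hint.tags(tags)    # author/epoch/genre/kind narrow the book filter,
#                      # theme tags narrow the part filter
#   search.search_perfect_book(u"lalka", hint=hint)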


class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse query in default Lucene Syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits)
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: a StringReader, a string/unicode, or tokens. In the last case
        they are just returned (so we can reuse tokens, if we don't change the analyzer).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks
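
    # Sketch: the field name only selects an analyzer registered in
    # WLAnalyzer, so the pseudo-fields 'SIMPLE', 'POLISH' and 'KEYWORD'
    # can be used to tokenize arbitrary text, e.g.:
    #
    #   search.get_tokens(u"Pan Tadeusz", field='SIMPLE')  # [u'pan', u'tadeusz']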

    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase
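
    # Sketch: with fuzzy=False this is a plain PhraseQuery with the given
    # slop; with fuzzy=True each position holds every indexed term within
    # the fuzziness threshold, e.g.:
    #
    #   q = search.make_phrase(search.get_tokens(u"pan tadeusz"), field='title')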

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy
        """
        q = BooleanQuery()

        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))

        return q

    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self.searcher, found, snippets=(snippets and self.get_snippets(found, query) or None))
                for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)
            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                    fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self.searcher, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=self.get_snippets(found, query))
                for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()
        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
            self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
            max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2,
        the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                    flt]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q),
                                          how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use the search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the rest
        words from some chapter.
        """
        if tokens_cache is None: tokens_cache = {}

        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content only query : themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                            fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found='search_everywhere_themesXcontent'))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found='search_everywhere'))

        return books
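
    # The search_* methods above form a cascade; a possible combination
    # (sketch) is:
    #
    #   results = SearchResult.aggregate(
    #       search.search_perfect_book(query),
    #       search.search_perfect_parts(query),
    #       search.search_everywhere(query))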

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #
    #     kw_level = BooleanQuery()
    #
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate content.
        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None

        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(position),
                                 int(length)))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Search for Tag objects using query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term
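
    # Sketch: term_filter(Term('is_book', 'true')) restricts a search to
    # book-level documents, while inverse=True flips it to parts only, as
    # in search_perfect_parts() above.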

    def hint_tags(self, string, max_results=50):
        """
        Return auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Return auto-complete hints for book titles,
        using prefix search (we do not index 'pseudo' title-tags).
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
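
    # Usage sketch for auto-complete endpoints (assumes an open index):
    #
    #   search = Search()
    #   search.hint_tags(u"mick")    # Tag objects whose name starts with the prefix
    #   search.hint_books(u"pan t")  # Book objects matching the title prefix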

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()