search fixes
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from multiprocessing.pool import ThreadPool
31 from threading import current_thread
32 import atexit
33 import traceback
34
35
36 class WLAnalyzer(PerFieldAnalyzerWrapper):
37     def __init__(self):
38         polish = PolishAnalyzer(Version.LUCENE_34)
39         #        polish_gap.setPositionIncrementGap(999)
40
41         simple = SimpleAnalyzer(Version.LUCENE_34)
42         #        simple_gap.setPositionIncrementGap(999)
43
44         keyword = KeywordAnalyzer(Version.LUCENE_34)
45
46         # not sure if needed: there's NOT_ANALYZED meaning basically the same
47
48         PerFieldAnalyzerWrapper.__init__(self, polish)
49
50         self.addAnalyzer("tags", simple)
51         self.addAnalyzer("technical_editors", simple)
52         self.addAnalyzer("editors", simple)
53         self.addAnalyzer("url", keyword)
54         self.addAnalyzer("source_url", keyword)
55         self.addAnalyzer("source_name", simple)
56         self.addAnalyzer("publisher", simple)
57         self.addAnalyzer("authors", simple)
58         self.addAnalyzer("title", simple)
59
60         self.addAnalyzer("is_book", keyword)
61         # shouldn't the title have two forms? _pl and simple?
62
63         self.addAnalyzer("themes", simple)
64         self.addAnalyzer("themes_pl", polish)
65
66         self.addAnalyzer("tag_name", simple)
67         self.addAnalyzer("tag_name_pl", polish)
68
69         self.addAnalyzer("translators", simple)
70
71         self.addAnalyzer("KEYWORD", keyword)
72         self.addAnalyzer("SIMPLE", simple)
73         self.addAnalyzer("POLISH", polish)
74
75
76 class IndexStore(object):
77     """
78     Provides access to the search index.
79 
80     self.store - Lucene index directory
81     """
82     def __init__(self):
83         self.make_index_dir()
84         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
85
86     def make_index_dir(self):
87         try:
88             os.makedirs(settings.SEARCH_INDEX)
89         except OSError as exc:
90             if exc.errno == errno.EEXIST:
91                 pass
92             else: raise
93
94
95 class IndexChecker(IndexStore):
96     def __init__(self):
97         IndexStore.__init__(self)
98
99     def check(self):
100         checker = CheckIndex(self.store)
101         status = checker.checkIndex()
102         return status
103
104
105 class Snippets(object):
106     """
107     This class manages the snippet file for an indexed object (book).
108     The snippets are concatenated together; their positions and
109     lengths are kept in Lucene index fields.
110     """
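    # A minimal usage sketch (the book id below is illustrative):
    #
    #   snippets = Snippets(1).open('w')
    #   try:
    #       pos = snippets.add(u"Litwo! Ojczyzno moja!")  # returns (position, length)
    #   finally:
    #       snippets.close()
    #
    #   snippets = Snippets(1).open()
    #   try:
    #       text = snippets.get(pos)  # unicode snippet stored at that position
    #   finally:
    #       snippets.close()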
111     SNIPPET_DIR = "snippets"
112
113     def __init__(self, book_id):
114         try:
115             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
116         except OSError as exc:
117             if exc.errno == errno.EEXIST:
118                 pass
119             else: raise
120         self.book_id = book_id
121         self.file = None
122
123     def open(self, mode='r'):
124         """
125         Open the snippet file. Call .close() afterwards.
126         """
127         if 'b' not in mode:
128             mode += 'b'
129         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
130         self.position = 0
131         return self
132
133     def add(self, snippet):
134         """
135         Append a snippet (unicode) to the snippet file.
136         Return a (position, length) tuple
137         """
138         txt = snippet.encode('utf-8')
139         l = len(txt)
140         self.file.write(txt)
141         pos = (self.position, l)
142         self.position += l
143         return pos
144
145     def get(self, pos):
146         """
147         Given a (position, length) tuple, return the unicode
148         snippet stored there.
149         """
150         self.file.seek(pos[0], 0)
151         txt = self.file.read(pos[1]).decode('utf-8')
152         return txt
153
154     def close(self):
155         """Close snippet file"""
156         self.file.close()
157
158
159 class BaseIndex(IndexStore):
160     """
161     Base index class.
162     Provides basic operations on the index: opening, closing, optimizing.
163     """
164     def __init__(self, analyzer=None):
165         super(BaseIndex, self).__init__()
166         self.index = None
167         if not analyzer:
168             analyzer = WLAnalyzer()
169         self.analyzer = analyzer
170
171     def open(self, analyzer=None):
172         if self.index:
173             raise Exception("Index is already opened")
174         self.index = IndexWriter(self.store, self.analyzer,\
175                                  IndexWriter.MaxFieldLength.LIMITED)
176         return self.index
177
178     def optimize(self):
179         self.index.optimize()
180
181     def close(self):
182         try:
183             self.index.optimize()
184         except JavaError as je:
185             print "Error during optimize phase, check index: %s" % je
186
187         self.index.close()
188         self.index = None
189
190     def __enter__(self):
191         self.open()
192         return self
193
194     def __exit__(self, type, value, tb):
195         self.close()
196
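    # A minimal sketch of the context manager protocol defined above
    # (Index is the concrete subclass defined below):
    #
    #   with Index() as idx:
    #       idx.index_tags()
    #   # on exit the writer is optimized and closed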
197
198 class Index(BaseIndex):
199     """
200     Class indexing books.
201     """
202     def __init__(self, analyzer=None):
203         super(Index, self).__init__(analyzer)
204
205     def index_tags(self):
206         """
207         Re-index global tag list.
208         Removes all tags from the index, then indexes them again.
209         Indexed fields include: id, name (with and without Polish stems), category.
210         """
211         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
212         self.index.deleteDocuments(q)
213
214         for tag in catalogue.models.Tag.objects.all():
215             doc = Document()
216             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
217             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
218             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
220             self.index.addDocument(doc)
221
222     def create_book_doc(self, book):
223         """
224         Create a Lucene document referring to the book id.
225         """
226         doc = Document()
227         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
228         if book.parent is not None:
229             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
230         return doc
231
232     def remove_book(self, book):
233         """Removes a book from search index.
234         book - Book instance."""
235         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
236         self.index.deleteDocuments(q)
237
238     def index_book(self, book, book_info=None, overwrite=True):
239         """
240         Indexes the book.
241         Creates a lucene document for extracted metadata
242         and calls self.index_content() to index the contents of the book.
243         """
244         if overwrite:
245             self.remove_book(book)
246
247         book_doc = self.create_book_doc(book)
248         meta_fields = self.extract_metadata(book, book_info)
249         for f in meta_fields.values():
250             if isinstance(f, list) or isinstance(f, tuple):
251                 for elem in f:
252                     book_doc.add(elem)
253             else:
254                 book_doc.add(f)
255
256         self.index.addDocument(book_doc)
257         del book_doc
258
259         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])
260
261     master_tags = [
262         'opowiadanie',
263         'powiesc',
264         'dramat_wierszowany_l',
265         'dramat_wierszowany_lp',
266         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
267         'wywiad'
268         ]
269
270     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
271
272     def extract_metadata(self, book, book_info=None):
273         """
274         Extracts metadata from the book and returns a map of fields keyed by field name.
275         """
276         fields = {}
277
278         if book_info is None:
279             book_info = dcparser.parse(open(book.xml_file.path))
280
281         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
282         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
283         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
284
285         # validator, name
286         for field in dcparser.BookInfo.FIELDS:
287             if hasattr(book_info, field.name):
288                 if not getattr(book_info, field.name):
289                     continue
290                 # since no type information is available, we use validator
291                 type_indicator = field.validator
292                 if type_indicator == dcparser.as_unicode:
293                     s = getattr(book_info, field.name)
294                     if field.multiple:
295                         s = ', '.join(s)
296                     try:
297                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
298                     except JavaError as je:
299                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
300                 elif type_indicator == dcparser.as_person:
301                     p = getattr(book_info, field.name)
302                     if isinstance(p, dcparser.Person):
303                         persons = unicode(p)
304                     else:
305                         persons = ', '.join(map(unicode, p))
306                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
307                 elif type_indicator == dcparser.as_date:
308                     dt = getattr(book_info, field.name)
309                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
310                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
311
312         return fields
313
314     def add_gaps(self, fields, fieldname):
315         """
316         Interleaves a list of fields with gap fields (indexed spaces) and returns the result.
317         This allows phrase queries which do not cross the gaps (when slop is 0).
318         """
319         def gap():
320             while True:
321                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
322         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
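    # A sketch of what add_gaps produces (the tag values are illustrative):
    #
    #   fields = [Field("tags", t, Field.Store.NO, Field.Index.ANALYZED)
    #             for t in [u"romantyzm", u"epika"]]
    #   gapped = self.add_gaps(fields, "tags")
    #   # -> [tags:romantyzm, tags:' ' (gap), tags:epika]; the NOT_ANALYZED space
    #   # keeps a slop-0 phrase query from matching across adjacent tag values.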
323
324     def get_master(self, root):
325         """
326         Returns the first master tag from an etree.
327         """
328         for master in root.iter():
329             if master.tag in self.master_tags:
330                 return master
331
332     def index_content(self, book, book_fields=[]):
333         """
334         Walks the book XML and extracts content from it.
335         Adds parts for each header tag and for each fragment.
336         """
337         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
338         root = wld.edoc.getroot()
339
340         master = self.get_master(root)
341         if master is None:
342             return []
343
344         def walker(node):
345             yield node, None
346             for child in list(node):
347                 for b, e in walker(child):
348                     yield b, e
349             yield None, node
350             return
351
352         def fix_format(text):
353             return re.sub("(?m)/$", "", text)
354
355         def add_part(snippets, **fields):
356             doc = self.create_book_doc(book)
357             for f in book_fields:
358                 doc.add(f)
359
360             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
361             doc.add(NumericField("header_span", Field.Store.YES, True)\
362                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
363             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
364
365             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
366                           Field.TermVector.WITH_POSITIONS_OFFSETS))
367
368             snip_pos = snippets.add(fields["content"])
369             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
370             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
371
372             if 'fragment_anchor' in fields:
373                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
374                               Field.Store.YES, Field.Index.NOT_ANALYZED))
375
376             if 'themes' in fields:
377                 themes, themes_pl = zip(*[
378                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
379                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
380                      for theme in fields['themes']])
381
382                 themes = self.add_gaps(themes, 'themes')
383                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
384
385                 for t in themes:
386                     doc.add(t)
387                 for t in themes_pl:
388                     doc.add(t)
389
390             return doc
391
392         def give_me_utf8(s):
393             if isinstance(s, unicode):
394                 return s.encode('utf-8')
395             else:
396                 return s
397
398         fragments = {}
399         snippets = Snippets(book.id).open('w')
400         try:
401             for header, position in zip(list(master), range(len(master))):
402
403                 if header.tag in self.skip_header_tags:
404                     continue
405                 if header.tag is etree.Comment:
406                     continue
407
408                 # section content
409                 content = []
410
411                 for start, end in walker(header):
412                     # Handle fragments and themes.
413                     if start is not None and start.tag == 'begin':
414                         fid = start.attrib['id'][1:]
415                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
416
417                     elif start is not None and start.tag == 'motyw':
418                         fid = start.attrib['id'][1:]
419                         if start.text is not None:
420                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
421
422                     elif start is not None and start.tag == 'end':
423                         fid = start.attrib['id'][1:]
424                         if fid not in fragments:
425                             continue  # a broken <end> node, skip it
427                         frag = fragments[fid]
428                         if frag['themes'] == []:
429                             continue  # empty themes list.
430                         del fragments[fid]
431
432                         def jstr(l):
433                             return u' '.join(map(
434                                 lambda x: x == None and u'(none)' or unicode(x),
435                                 l))
436
437                         doc = add_part(snippets,
438                                        header_type=frag['start_header'],
439                                        header_index=frag['start_section'],
440                                        header_span=position - frag['start_section'] + 1,
441                                        fragment_anchor=fid,
442                                        content=u' '.join(filter(lambda s: s is not None, frag['content'])),
443                                        themes=frag['themes'])
444
445                         self.index.addDocument(doc)
446
447                     # Collect content.
448                     elif start is not None:
449                         for frag in fragments.values():
450                             frag['content'].append(start.text)
451                         content.append(start.text)
452                     elif end is not None:
453                         for frag in fragments.values():
454                             frag['content'].append(end.tail)
455                         content.append(end.tail)
456
457                 # Finally, add the section text.
458                 doc = add_part(snippets, header_index=position, header_type=header.tag,
459                                content=fix_format(u' '.join(filter(lambda s: s is not None, content))))
460
461                 self.index.addDocument(doc)
462
463         finally:
464             snippets.close()
465
466
467 def log_exception_wrapper(f):
468     def _wrap(*a):
469         try:
470             f(*a)
471         except Exception as e:
472             print("Error in indexing thread: %s" % e)
473             traceback.print_exc()
474             raise
475     return _wrap
476
477
478 class ReusableIndex(Index):
479     """
480     Works like Index, but does not close/optimize the Lucene index
481     until program exit (uses an atexit hook).
482     This is useful for the importbooks command.
483 
484     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
485     """
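    # Sketch of the intended pattern (e.g. inside a management command;
    # the loop below is illustrative):
    #
    #   idx = ReusableIndex()
    #   idx.open()                    # reuses a single shared IndexWriter
    #   for book in catalogue.models.Book.objects.all():
    #       idx.index_book(book)
    #   # the shared index is optimized and closed by close_reusable() via atexit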
486     index = None
487
488     def open(self, analyzer=None, threads=4):
489         if ReusableIndex.index is not None:
490             self.index = ReusableIndex.index
491         else:
492             print("opening index")
493             Index.open(self, analyzer)
494             ReusableIndex.index = self.index
495             atexit.register(ReusableIndex.close_reusable)
496
497     # def index_book(self, *args, **kw):
498     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
499     #     ReusableIndex.pool_jobs.append(job)
500
501     @staticmethod
502     def close_reusable():
503         if ReusableIndex.index is not None:
504             ReusableIndex.index.optimize()
505             ReusableIndex.index.close()
506             ReusableIndex.index = None
507
508     def close(self):
509         pass
510
511
512 class JoinSearch(object):
513     """
514     This mixin could be used to handle block join queries.
515     (currently unused)
516     """
517     def __init__(self, *args, **kw):
518         super(JoinSearch, self).__init__(*args, **kw)
519
520     def wrapjoins(self, query, fields=[]):
521         """
522         This function modifies the query recursively, so that
523         contained Term and Phrase queries which match the
524         provided fields are wrapped in a BlockJoinQuery,
525         and thus delegated to child documents.
526         """
527         if BooleanQuery.instance_(query):
528             qs = BooleanQuery.cast_(query)
529             for clause in qs:
530                 clause = BooleanClause.cast_(clause)
531                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
532             return qs
533         else:
534             termset = HashSet()
535             query.extractTerms(termset)
536             for t in termset:
537                 t = Term.cast_(t)
538                 if t.field() not in fields:
539                     return query
540             return BlockJoinQuery(query, self.parent_filter,
541                                   BlockJoinQuery.ScoreMode.Total)
542
543     def bsearch(self, query, max_results=50):
544         q = self.query(query)
545         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
546
547         tops = self.searcher.search(bjq, max_results)
548         bks = []
549         for found in tops.scoreDocs:
550             doc = self.searcher.doc(found.doc)
551             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
552         return (bks, tops.totalHits)
553
554
555 class SearchResult(object):
556     def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
557         if tokens_cache is None: tokens_cache = {}
558             
559         if score:
560             self._score = score
561         else:
562             self._score = scoreDocs.score
563             
564         self.boost = 1.0
565
566         self._hits = []
567         self.hits = None  # processed hits
568
569         stored = searcher.doc(scoreDocs.doc)
570         self.book_id = int(stored.get("book_id"))
571
572         header_type = stored.get("header_type")
573         if not header_type:
574             return
575
576         sec = (header_type, int(stored.get("header_index")))
577         header_span = stored.get('header_span')
578         header_span = header_span is not None and int(header_span) or 1
579
580         fragment = stored.get("fragment_anchor")
581
582         if snippets:
583             snippets = snippets.replace("/\n", "\n")
584         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
585
586         self._hits.append(hit)
587
588         self.searcher = searcher
589         self.searched = searched
590         self.tokens_cache = tokens_cache
591
592     @property
593     def score(self):
594         return self._score * self.boost
595
596     def merge(self, other):
597         if self.book_id != other.book_id:
598             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
599         self._hits += other._hits
600         if other.score > self.score:
601             self._score = other._score  # 'score' is a read-only property; keep the raw score
602         return self
603
604     def get_book(self):
605         return catalogue.models.Book.objects.get(id=self.book_id)
606
607     book = property(get_book)
608
609     def process_hits(self):
610         POSITION = 0
611         FRAGMENT = 1
612         POSITION_INDEX = 1
613         POSITION_SPAN = 2
614         SCORE = 2
615         OTHER = 3
616
617         # to sections and fragments
618         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
619         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
620         sect = filter(lambda s: 0 == len(filter(
621             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
622             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
623             frags)), sect)
624
625         hits = []
626
627         # remove duplicate fragments
628         fragments = {}
629         for f in frags:
630             fid = f[FRAGMENT]
631             if fid in fragments:
632                 if fragments[fid][SCORE] >= f[SCORE]:
633                     continue
634             fragments[fid] = f
635         frags = fragments.values()
636
637         # remove duplicate sections
638         sections = {}
639
640         for s in sect:
641             si = s[POSITION][POSITION_INDEX]
642             # skip existing
643             if si in sections:
644                 if sections[si]['score'] >= s[SCORE]:
645                     continue
646
647             m = {'score': s[SCORE],
648                  'section_number': s[POSITION][POSITION_INDEX] + 1,
649                  }
650             m.update(s[OTHER])
651             sections[si] = m
652
653         hits = sections.values()
654
655         for f in frags:
656             try:
657                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
658             except catalogue.models.Fragment.DoesNotExist:
659                 # stale index
660                 continue
661
662             # Figure out if we were searching for a token matching some word in theme name.
663             themes = frag.tags.filter(category='theme')
664             themes_hit = []
665             if self.searched is not None:
666                 tokens = self.searcher.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
667                 for theme in themes:
668                     name_tokens = self.searcher.get_tokens(theme.name, 'POLISH')
669                     for t in tokens:
670                         if t in name_tokens:
671                             if theme not in themes_hit:
672                                 themes_hit.append(theme)
673                             break
674
675             m = {'score': f[SCORE],
676                  'fragment': frag,
677                  'section_number': f[POSITION][POSITION_INDEX] + 1,
678                  'themes': themes,
679                  'themes_hit': themes_hit
680                  }
681             m.update(f[OTHER])
682             hits.append(m)
683
684         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
685
686         self.hits = hits
687
688         return self
689
690     def __unicode__(self):
691         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
692
693     @staticmethod
694     def aggregate(*result_lists):
695         books = {}
696         for rl in result_lists:
697             for r in rl:
698                 if r.book_id in books:
699                     books[r.book_id].merge(r)
700                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
701                 else:
702                     books[r.book_id] = r
703         return books.values()
704
705     def __cmp__(self, other):
706         return cmp(self.score, other.score)
707
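    # Aggregation sketch: collect hits from several strategies into one result
    # per book and order by score (the query string below is illustrative):
    #
    #   results = SearchResult.aggregate(
    #       search.search_perfect_book(u"lalka"),
    #       search.search_everywhere(u"lalka"))
    #   results.sort(reverse=True)    # uses __cmp__ above
    #   for r in results:
    #       r.process_hits()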
708
709 class Hint(object):
710     """
711     Given some hint information (things we already know about
712     the search target, like author, title of a specific book, epoch, genre, kind),
713     we can narrow down the search using filters.
714     """
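    # A minimal sketch of combining a Hint with Search (defined below);
    # `user_tags` stands for an illustrative list of Tag objects:
    #
    #   search = Search()
    #   hint = search.hint()
    #   hint.tags(user_tags)
    #   results = search.search_perfect_book(u"pan tadeusz", hint=hint)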
715     def __init__(self, search):
716         """
717         Accepts a Searcher instance.
718         """
719         self.search = search
720         self.book_tags = {}
721         self.part_tags = []
722         self._books = []
723
724     def books(self, *books):
725         """
726         Give a hint that we search these books.
727         """
728         self._books = books
729
730     def tags(self, tags):
731         """
732         Give a hint that these Tag objects (a list of them)
733         are necessary.
734         """
735         for t in tags:
736             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
737                 lst = self.book_tags.get(t.category, [])
738                 lst.append(t)
739                 self.book_tags[t.category] = lst
740             if t.category in ['theme', 'theme_pl']:
741                 self.part_tags.append(t)
742
743     def tag_filter(self, tags, field='tags'):
744         """
745         Given a list of tags and an optional field (they are normally in the 'tags' field),
746         returns a filter accepting only books with those specific tags.
747         """
748         q = BooleanQuery()
749
750         for tag in tags:
751             toks = self.search.get_tokens(tag.name, field=field)
752             tag_phrase = PhraseQuery()
753             for tok in toks:
754                 tag_phrase.add(Term(field, tok))
755             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
756
757         return QueryWrapperFilter(q)
758
759     def book_filter(self):
760         """
761         Filters using book tags (all tag categories except themes).
762         """
763         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
764         if tags:
765             return self.tag_filter(tags)
766         else:
767             return None
768
769     def part_filter(self):
770         """
771         This filter can be used to look for book parts.
772         It filters on book id and/or themes.
773         """
774         fs = []
775         if self.part_tags:
776             fs.append(self.tag_filter(self.part_tags, field='themes'))
777
778         if self._books != []:
779             bf = BooleanFilter()
780             for b in self._books:
781                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
782                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
783             fs.append(bf)
784
785         return Search.chain_filters(fs)
786
787     def should_search_for_book(self):
788         return self._books == []
789
790     def just_search_in(self, all):
791         """Holds logic to figure out which fields should be searched, when we already have some hints."""
792         some = []
793         for field in all:
794             if field == 'authors' and 'author' in self.book_tags:
795                 continue
796             if field == 'title' and self._books != []:
797                 continue
798             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
799                 continue
800             some.append(field)
801         return some
802
803
804 class Search(IndexStore):
805     """
806     Search facilities.
807     """
808     def __init__(self, default_field="content"):
809         IndexStore.__init__(self)
810         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
811         # self.analyzer = WLAnalyzer()
812         self.searcher = IndexSearcher(self.store, True)
813         self.parser = QueryParser(Version.LUCENE_34, default_field,
814                                   self.analyzer)
815
816         self.parent_filter = TermsFilter()
817         self.parent_filter.addTerm(Term("is_book", "true"))
818
819     def query(self, query):
820         """Parse query in default Lucene Syntax. (for humans)
821         """
822         return self.parser.parse(query)
823
824     def simple_search(self, query, max_results=50):
825         """Runs a query for books using lucene syntax. (for humans)
826         Returns (books, total_hits)
827         """
828
829         tops = self.searcher.search(self.query(query), max_results)
830         bks = []
831         for found in tops.scoreDocs:
832             doc = self.searcher.doc(found.doc)
833             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
834         return (bks, tops.totalHits)
835
836     def get_tokens(self, searched, field='content', cached=None):
837         """Returns tokens analyzed by the proper (per-field) analyzer.
838         The argument can be a StringReader, a string/unicode, or a list of tokens. In the last case
839         they are just returned (so we can reuse tokens if we don't change the analyzer).
840         """
841         if cached is not None and field in cached:
842             return cached[field]
843
844         if isinstance(searched, str) or isinstance(searched, unicode):
845             searched = StringReader(searched)
846         elif isinstance(searched, list):
847             return searched
848
849         searched.reset()
850         tokens = self.analyzer.reusableTokenStream(field, searched)
851         toks = []
852         while tokens.incrementToken():
853             cta = tokens.getAttribute(CharTermAttribute.class_)
854             toks.append(cta.toString())
855
856         if cached is not None:
857             cached[field] = toks
858
859         return toks
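    # Sketch: the `cached` dict lets callers reuse analysis results per field
    # (the query string is illustrative):
    #
    #   cache = {}
    #   toks = search.get_tokens(u"pan tadeusz", field='SIMPLE', cached=cache)
    #   toks = search.get_tokens(u"pan tadeusz", field='SIMPLE', cached=cache)  # served from cache['SIMPLE']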
860
861     def fuzziness(self, fuzzy):
862         """Helper method to sanitize fuzziness"""
863         if not fuzzy:
864             return None
865         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
866             return fuzzy
867         else:
868             return 0.5
869
870     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
871         """
872         Return a PhraseQuery with a series of tokens.
873         """
874         if fuzzy:
875             phrase = MultiPhraseQuery()
876             for t in tokens:
877                 term = Term(field, t)
878                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
879                 fuzzterms = []
880
881                 while True:
882                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
883                     ft = fuzzterm.term()
884                     if ft:
885                         fuzzterms.append(ft)
886                     if not fuzzterm.next(): break
887                 if fuzzterms:
888                     phrase.add(JArray('object')(fuzzterms, Term))
889                 else:
890                     phrase.add(term)
891         else:
892             phrase = PhraseQuery()
893             phrase.setSlop(slop)
894             for t in tokens:
895                 term = Term(field, t)
896                 phrase.add(term)
897         return phrase
898
899     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
900         """
901         Returns term queries joined into a boolean query.
902         modal - the occurrence modal applied to each boolean clause
903         fuzzy - whether the query should be fuzzy.
904         """
905         q = BooleanQuery()
906         for t in tokens:
907             term = Term(field, t)
908             if fuzzy:
909                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
910             else:
911                 term = TermQuery(term)
912             q.add(BooleanClause(term, modal))
913         return q
914
915     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
916                       filters=None, tokens_cache=None, boost=None):
917         if filters is None: filters = []
918         if tokens_cache is None: tokens_cache = {}
919
920         tokens = self.get_tokens(searched, field, cached=tokens_cache)
921
922         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy)
923         if book:
924             filters.append(self.term_filter(Term('is_book', 'true')))
925         top = self.searcher.search(query, self.chain_filters(filters), max_results)
926
927         return [SearchResult(self.searcher, found) for found in top.scoreDocs]
928
929     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
930                     filters=None, tokens_cache=None, boost=None):
931         if filters is None: filters = []
932         if tokens_cache is None: tokens_cache = {}
933
934         if book:
935             filters.append(self.term_filter(Term('is_book', 'true')))
936
937         query = BooleanQuery()
938
939         for fld in fields:
940             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
941
942             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
943                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
944
945         top = self.searcher.search(query, self.chain_filters(filters), max_results)
946
947         return [SearchResult(self.searcher, found, searched=searched, tokens_cache=tokens_cache) for found in top.scoreDocs]
948
949     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
950         """
951         Search for perfect book matches. Just see if the query matches some author or title,
952         taking hints into account.
953         """
954         fields_to_search = ['authors', 'title']
955         only_in = None
956         if hint:
957             if not hint.should_search_for_book():
958                 return []
959             fields_to_search = hint.just_search_in(fields_to_search)
960             only_in = hint.book_filter()
961
962         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
963
964         books = []
965         for q in qrys:
966             top = self.searcher.search(q,
967                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
968                 max_results)
969             for found in top.scoreDocs:
970                 books.append(SearchResult(self.searcher, found, how_found="search_perfect_book"))
971         return books
972
973     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
974         fields_to_search = ['tags', 'authors', 'title']
975
976         only_in = None
977         if hint:
978             if not hint.should_search_for_book():
979                 return []
980             fields_to_search = hint.just_search_in(fields_to_search)
981             only_in = hint.book_filter()
982
983         tokens = self.get_tokens(searched, field='SIMPLE')
984
985         q = BooleanQuery()
986
987         for fld in fields_to_search:
988             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
989                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
990
991         books = []
992         top = self.searcher.search(q,
993                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
994             max_results)
995         for found in top.scoreDocs:
996             books.append(SearchResult(self.searcher, found, how_found="search_book"))
997
998         return books
999
1000     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1001         """
1002         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1003         some part/fragment of the book.
1004         """
1005         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1006
1007         flt = None
1008         if hint:
1009             flt = hint.part_filter()
1010
1011         books = []
1012         for q in qrys:
1013             top = self.searcher.search(q,
1014                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1015                                                            flt]),
1016                                        max_results)
1017             for found in top.scoreDocs:
1018                 books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1019
1020         return books
1021
1022     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1023         """
1024         Tries to use search terms to match different fields of the book (or its parts).
1025         E.g. one word can be an author's surname, another a part of the title, and the rest
1026         some words from the third chapter.
1027         """
1028         if tokens_cache is None: tokens_cache = {}
1029         books = []
1030         only_in = None
1031
1032         if hint:
1033             only_in = hint.part_filter()
1034
1035         # content only query : themes x content
1036         q = BooleanQuery()
1037
1038         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1039         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1040
1041         # only search in themes when we do not already filter by themes
1042         if hint is None or hint.just_search_in(['themes']) != []:
1043             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1044                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1045
1046         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1047                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1048
1049         topDocs = self.searcher.search(q, only_in, max_results)
1050         for found in topDocs.scoreDocs:
1051             books.append(SearchResult(self.searcher, found, how_found='search_everywhere_themesXcontent'))
1052             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1053
1054         # query themes/content x author/title/tags
1055         q = BooleanQuery()
1056         in_content = BooleanQuery()
1057         in_meta = BooleanQuery()
1058
1059         for fld in ['themes_pl', 'content']:
1060             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1061
1062         for fld in ['tags', 'authors', 'title']:
1063             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1064
1065         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1066         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1067
1068         topDocs = self.searcher.search(q, only_in, max_results)
1069         for found in topDocs.scoreDocs:
1070             books.append(SearchResult(self.searcher, found, how_found='search_everywhere'))
1071             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1072
1073         return books
1074
1075     # def multisearch(self, query, max_results=50):
1076     #     """
1077     #     Search strategy:
1078     #     - (phrase) OR -> content
1079     #                   -> title
1080     #                   -> authors
1081     #     - (keywords)  -> authors
1082     #                   -> motyw
1083     #                   -> tags
1084     #                   -> content
1085     #     """
1086         # queryreader = StringReader(query)
1087         # tokens = self.get_tokens(queryreader)
1088
1089         # top_level = BooleanQuery()
1090         # Should = BooleanClause.Occur.SHOULD
1091
1092         # phrase_level = BooleanQuery()
1093         # phrase_level.setBoost(1.3)
1094
1095         # p_content = self.make_phrase(tokens, joined=True)
1096         # p_title = self.make_phrase(tokens, 'title')
1097         # p_author = self.make_phrase(tokens, 'author')
1098
1099         # phrase_level.add(BooleanClause(p_content, Should))
1100         # phrase_level.add(BooleanClause(p_title, Should))
1101         # phrase_level.add(BooleanClause(p_author, Should))
1102
1103         # kw_level = BooleanQuery()
1104
1105         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1106         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1107         # kw_level.add(j_themes, Should)
1108         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1109         # j_con = self.make_term_query(tokens, joined=True)
1110         # kw_level.add(j_con, Should)
1111
1112         # top_level.add(BooleanClause(phrase_level, Should))
1113         # top_level.add(BooleanClause(kw_level, Should))
1114
1115         # return None
1116
1117     def get_snippets(self, scoreDoc, query, field='content'):
1118         """
1119         Returns a snippet for found scoreDoc.
1120         """
1121         htmlFormatter = SimpleHTMLFormatter()
1122         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1123
1124         stored = self.searcher.doc(scoreDoc.doc)
1125
1126         # locate content.
1127         snippets = Snippets(stored.get('book_id')).open()
1128         try:
1129             text = snippets.get((int(stored.get('snippets_position')),
1130                                  int(stored.get('snippets_length'))))
1131         finally:
1132             snippets.close()
1133
1134         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1135         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1136         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1137
1138         return snip
1139
1140     @staticmethod
1141     def enum_to_array(enum):
1142         """
1143         Converts a Lucene TermEnum to an array of Terms, suitable for
1144         adding to queries.
1145         """
1146         terms = []
1147
1148         while True:
1149             t = enum.term()
1150             if t:
1151                 terms.append(t)
1152             if not enum.next(): break
1153
1154         if terms:
1155             return JArray('object')(terms, Term)
1156
1157     def search_tags(self, query, filter=None, max_results=40):
1158         """
1159         Search for Tag objects using query.
1160         """
1161         tops = self.searcher.search(query, filter, max_results)
1162
1163         tags = []
1164         for found in tops.scoreDocs:
1165             doc = self.searcher.doc(found.doc)
1166             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1167             tags.append(tag)
1168             print "%s (%d) -> %f" % (tag, tag.id, found.score)
1169
1170         return tags
1171
1172     def search_books(self, query, filter=None, max_results=10):
1173         """
1174         Searches for Book objects using query
1175         """
1176         bks = []
1177         tops = self.searcher.search(query, filter, max_results)
1178         for found in tops.scoreDocs:
1179             doc = self.searcher.doc(found.doc)
1180             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1181         return bks
1182
1183     def create_prefix_phrase(self, toks, field):
1184         q = MultiPhraseQuery()
1185         for i in range(len(toks)):
1186             t = Term(field, toks[i])
1187             if i == len(toks) - 1:
1188                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1189                 if pterms:
1190                     q.add(pterms)
1191                 else:
1192                     q.add(t)
1193             else:
1194                 q.add(t)
1195         return q
1196
1197     @staticmethod
1198     def term_filter(term, inverse=False):
1199         only_term = TermsFilter()
1200         only_term.addTerm(term)
1201
1202         if inverse:
1203             neg = BooleanFilter()
1204             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1205             only_term = neg
1206
1207         return only_term
1208
1209     def hint_tags(self, string, max_results=50):
1210         """
1211         Return auto-complete hints for tags
1212         using prefix search.
1213         """
1214         toks = self.get_tokens(string, field='SIMPLE')
1215         top = BooleanQuery()
1216
1217         for field in ['tag_name', 'tag_name_pl']:
1218             q = self.create_prefix_phrase(toks, field)
1219             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1220
1221         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1222
1223         return self.search_tags(top, no_book_cat, max_results=max_results)
1224
1225     def hint_books(self, string, max_results=50):
1226         """
1227         Returns auto-complete hints for book titles,
1228         since we do not index 'pseudo' title-tags.
1229         Uses prefix search.
1230         """
1231         toks = self.get_tokens(string, field='SIMPLE')
1232
1233         q = self.create_prefix_phrase(toks, 'title')
1234
1235         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1236
1237     @staticmethod
1238     def chain_filters(filters, op=ChainedFilter.AND):
1239         """
1240         Chains a filter list together
1241         """
1242         filters = filter(lambda x: x is not None, filters)
1243         if not filters:
1244             return None
1245         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1246         return chf
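
    # Sketch: combining filters (None entries are dropped above):
    #
    #   flt = Search.chain_filters([
    #       Search.term_filter(Term('is_book', 'true')),
    #       None,                     # e.g. an optional hint filter
    #   ])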
1247
1248     def filtered_categories(self, tags):
1249         """
1250         Return a list of tag categories present in the tags list.
1251         """
1252         cats = {}
1253         for t in tags:
1254             cats[t.category] = True
1255         return cats.keys()
1256
1257     def hint(self):
1258         return Hint(self)