apps/search/index.py

   1 # -*- coding: utf-8 -*-
   2
   3 from django.conf import settings
   4 from lucene import SimpleFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
   5     File, Field, Integer, \
   6     NumericField, Version, Document, JavaError, IndexSearcher, \
   7     QueryParser, PerFieldAnalyzerWrapper, \
   8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
   9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
  10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
  11     HashSet, BooleanClause, Term, CharTermAttribute, \
  12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
  13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
  14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
  15     BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
  16     initVM, CLASSPATH, JArray, JavaError
  17     # KeywordAnalyzer
  18
  19 # Initialize jvm
  20 JVM = initVM(CLASSPATH)
  21
  22 import sys
  23 import os
  24 import re
  25 import errno
  26 from librarian import dcparser
  27 from librarian.parser import WLDocument
  28 from lxml import etree
  29 import catalogue.models
  30 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  31 from multiprocessing.pool import ThreadPool
  32 from threading import current_thread
  33 import atexit
  34 import traceback
  35
  36
  37 class WLAnalyzer(PerFieldAnalyzerWrapper):
  38     def __init__(self):
  39         polish = PolishAnalyzer(Version.LUCENE_34)
  40         #        polish_gap.setPositionIncrementGap(999)
  41
  42         simple = SimpleAnalyzer(Version.LUCENE_34)
  43         #        simple_gap.setPositionIncrementGap(999)
  44
  45         keyword = KeywordAnalyzer(Version.LUCENE_34)
  46
  47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
  48
  49         PerFieldAnalyzerWrapper.__init__(self, polish)
  50
  51         self.addAnalyzer("tags", simple)
  52         self.addAnalyzer("technical_editors", simple)
  53         self.addAnalyzer("editors", simple)
  54         self.addAnalyzer("url", keyword)
  55         self.addAnalyzer("source_url", keyword)
  56         self.addAnalyzer("source_name", simple)
  57         self.addAnalyzer("publisher", simple)
  58         self.addAnalyzer("authors", simple)
  59         self.addAnalyzer("title", simple)
  60
  61         self.addAnalyzer("is_book", keyword)
  62         # shouldn't the title have two forms? _pl and simple?
  63
  64         self.addAnalyzer("themes", simple)
  65         self.addAnalyzer("themes_pl", polish)
  66
  67         self.addAnalyzer("tag_name", simple)
  68         self.addAnalyzer("tag_name_pl", polish)
  69
  70         self.addAnalyzer("translators", simple)
  71
  72         self.addAnalyzer("KEYWORD", keyword)
  73         self.addAnalyzer("SIMPLE", simple)
  74         self.addAnalyzer("POLISH", polish)
  75
  76
  77 class IndexStore(object):
  78     """
  79     Provides access to search index.
  80
  81     self.store - lucene index directory
  82     """
  83     def __init__(self):
  84         self.make_index_dir()
  85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
  86
  87     def make_index_dir(self):
  88         try:
  89             os.makedirs(settings.SEARCH_INDEX)
  90         except OSError as exc:
  91             if exc.errno == errno.EEXIST:
  92                 pass
  93             else: raise
  94
  95
  96 class IndexChecker(IndexStore):
  97     def __init__(self):
  98         IndexStore.__init__(self)
  99
 100     def check(self):
 101         checker = CheckIndex(self.store)
 102         status = checker.checkIndex()
 103         return status
 104
 105
 106 class Snippets(object):
 107     """
 108     This class manages snippet files for indexed object (book)
 109     the snippets are concatenated together, and their positions and
 110     lengths are kept in lucene index fields.
 111     """
 112     SNIPPET_DIR = "snippets"
 113
 114     def __init__(self, book_id):
 115         try:
 116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
 117         except OSError as exc:
 118             if exc.errno == errno.EEXIST:
 119                 pass
 120             else: raise
 121         self.book_id = book_id
 122         self.file = None
 123
 124     def open(self, mode='r'):
 125         """
 126         Open the snippet file. Call .close() afterwards.
 127         """
 128         if not 'b' in mode:
 129             mode += 'b'
 130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
 131         self.position = 0
 132         return self
 133
 134     def add(self, snippet):
 135         """
 136         Append a snippet (unicode) to the snippet file.
 137         Return a (position, length) tuple
 138         """
 139         txt = snippet.encode('utf-8')
 140         l = len(txt)
 141         self.file.write(txt)
 142         pos = (self.position, l)
 143         self.position += l
 144         return pos
 145
 146     def get(self, pos):
 147         """
 148         Given a tuple of (position, length) return an unicode
 149         of the snippet stored there.
 150         """
 151         self.file.seek(pos[0], 0)
 152         txt = self.file.read(pos[1]).decode('utf-8')
 153         return txt
 154
 155     def close(self):
 156         """Close snippet file"""
 157         self.file.close()
 158
 159
 160 class BaseIndex(IndexStore):
 161     """
 162     Base index class.
 163     Provides basic operations on index: opening, closing, optimizing.
 164     """
 165     def __init__(self, analyzer=None):
 166         super(BaseIndex, self).__init__()
 167         self.index = None
 168         if not analyzer:
 169             analyzer = WLAnalyzer()
 170         self.analyzer = analyzer
 171
 172     def open(self, timeout=None):
 173         if self.index:
 174             raise Exception("Index is already opened")
 175         conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
 176         if timeout:
 177             conf.setWriteLockTimeout(long(timeout))
 178         self.index = IndexWriter(self.store, conf)
 179         return self.index
 180
 181     def optimize(self):
 182         self.index.optimize()
 183
 184     def close(self):
 185         try:
 186             self.index.optimize()
 187         except JavaError, je:
 188             print "Error during optimize phase, check index: %s" % je
 189
 190         self.index.close()
 191         self.index = None
 192
 193     def __enter__(self):
 194         self.open()
 195         return self
 196
 197     def __exit__(self, type, value, tb):
 198         self.close()
 199
 200
 201 class Index(BaseIndex):
 202     """
 203     Class indexing books.
 204     """
 205     def __init__(self, analyzer=None):
 206         super(Index, self).__init__(analyzer)
 207
 208     def index_tags(self):
 209         """
 210         Re-index global tag list.
 211         Removes all tags from index, then index them again.
 212         Indexed fields include: id, name (with and without polish stems), category
 213         """
 214         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
 215         self.index.deleteDocuments(q)
 216
 217         for tag in catalogue.models.Tag.objects.all():
 218             doc = Document()
 219             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
 220             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 221             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 222             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
 223             self.index.addDocument(doc)
 224
 225         for pdtag in PDCounterAuthor.objects.all():
 226             doc = Document()
 227             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
 228             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
 229             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
 230             doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
 231             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
 232             self.index.addDocument(doc)
 233
 234         for pdtag in PDCounterBook.objects.all():
 235             doc = Document()
 236             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
 237             print pdtag.title
 238             doc.add(Field("tag_name", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
 239             doc.add(Field("tag_name_pl", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
 240             doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
 241             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
 242             self.index.addDocument(doc)
 243
 244     def create_book_doc(self, book):
 245         """
 246         Create a lucene document referring book id.
 247         """
 248         doc = Document()
 249         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
 250         if book.parent is not None:
 251             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
 252         return doc
 253
 254     def remove_book(self, book):
 255         """Removes a book from search index.
 256         book - Book instance."""
 257         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
 258         self.index.deleteDocuments(q)
 259
 260     def index_book(self, book, book_info=None, overwrite=True):
 261         """
 262         Indexes the book.
 263         Creates a lucene document for extracted metadata
 264         and calls self.index_content() to index the contents of the book.
 265         """
 266         if overwrite:
 267             self.remove_book(book)
 268
 269         book_doc = self.create_book_doc(book)
 270         meta_fields = self.extract_metadata(book, book_info)
 271         for f in meta_fields.values():
 272             if isinstance(f, list) or isinstance(f, tuple):
 273                 for elem in f:
 274                     book_doc.add(elem)
 275             else:
 276                 book_doc.add(f)
 277         self.index.addDocument(book_doc)
 278         del book_doc
 279
 280         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
 281
 282     master_tags = [
 283         'opowiadanie',
 284         'powiesc',
 285         'dramat_wierszowany_l',
 286         'dramat_wierszowany_lp',
 287         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 288         'wywiad',
 289         ]
 290
 291     ignore_content_tags = [
 292         'uwaga', 'extra',
 293         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 294         'didaskalia',
 295         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 296         ]
 297
 298     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 299
 300     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 301
 302     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 303
 304     def extract_metadata(self, book, book_info=None):
 305         """
 306         Extract metadata from book and returns a map of fields keyed by fieldname
 307         """
 308         fields = {}
 309
 310         if book_info is None:
 311             book_info = dcparser.parse(open(book.xml_file.path))
 312
 313         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
 314         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
 315         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
 316
 317         # validator, name
 318         for field in dcparser.BookInfo.FIELDS:
 319             if hasattr(book_info, field.name):
 320                 if not getattr(book_info, field.name):
 321                     continue
 322                 # since no type information is available, we use validator
 323                 type_indicator = field.validator
 324                 if type_indicator == dcparser.as_unicode:
 325                     s = getattr(book_info, field.name)
 326                     if field.multiple:
 327                         s = ', '.join(s)
 328                     try:
 329                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
 330                     except JavaError as je:
 331                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
 332                 elif type_indicator == dcparser.as_person:
 333                     p = getattr(book_info, field.name)
 334                     if isinstance(p, dcparser.Person):
 335                         persons = unicode(p)
 336                     else:
 337                         persons = ', '.join(map(unicode, p))
 338                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
 339                 elif type_indicator == dcparser.as_date:
 340                     dt = getattr(book_info, field.name)
 341                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
 342                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 343
 344         # get published date
 345         pd = None
 346         if hasattr(book_info, 'source_name') and book_info.source_name:
 347             match = self.published_date_re.search(book_info.source_name)
 348             if match is not None:
 349                 pd = str(match.groups()[0])
 350         if not pd: pd = ""
 351         fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
 352
 353         return fields
 354
 355     def add_gaps(self, fields, fieldname):
 356         """
 357         Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 358         This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 359         """
 360         def gap():
 361             while True:
 362                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 363         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 364
 365     def get_master(self, root):
 366         """
 367         Returns the first master tag from an etree.
 368         """
 369         for master in root.iter():
 370             if master.tag in self.master_tags:
 371                 return master
 372
 373     def index_content(self, book, book_fields=[]):
 374         """
 375         Walks the book XML and extract content from it.
 376         Adds parts for each header tag and for each fragment.
 377         """
 378         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 379         root = wld.edoc.getroot()
 380
 381         master = self.get_master(root)
 382         if master is None:
 383             return []
 384
 385         def walker(node, ignore_tags=[]):
 386
 387             if node.tag not in ignore_tags:
 388                 yield node, None, None
 389                 if node.text is not None:
 390                     yield None, node.text, None
 391                 for child in list(node):
 392                     for b, t, e in walker(child):
 393                         yield b, t, e
 394                 yield None, None, node
 395
 396             if node.tail is not None:
 397                 yield None, node.tail, None
 398             return
 399
 400         def fix_format(text):
 401             #            separator = [u" ", u"\t", u".", u";", u","]
 402             if isinstance(text, list):
 403                 # need to join it first
 404                 text = filter(lambda s: s is not None, content)
 405                 text = u' '.join(text)
 406                 # for i in range(len(text)):
 407                 #     if i > 0:
 408                 #         if text[i][0] not in separator\
 409                 #             and text[i - 1][-1] not in separator:
 410                 #          text.insert(i, u" ")
 411
 412             return re.sub("(?m)/$", "", text)
 413
 414         def add_part(snippets, **fields):
 415             doc = self.create_book_doc(book)
 416             for f in book_fields:
 417                 doc.add(f)
 418
 419             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
 420             doc.add(NumericField("header_span", Field.Store.YES, True)\
 421                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
 422             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
 423
 424             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
 425                           Field.TermVector.WITH_POSITIONS_OFFSETS))
 426
 427             snip_pos = snippets.add(fields["content"])
 428             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
 429             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
 430
 431             if 'fragment_anchor' in fields:
 432                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
 433                               Field.Store.YES, Field.Index.NOT_ANALYZED))
 434
 435             if 'themes' in fields:
 436                 themes, themes_pl = zip(*[
 437                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
 438                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
 439                      for theme in fields['themes']])
 440
 441                 themes = self.add_gaps(themes, 'themes')
 442                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
 443
 444                 for t in themes:
 445                     doc.add(t)
 446                 for t in themes_pl:
 447                     doc.add(t)
 448
 449             return doc
 450
 451         def give_me_utf8(s):
 452             if isinstance(s, unicode):
 453                 return s.encode('utf-8')
 454             else:
 455                 return s
 456
 457         fragments = {}
 458         snippets = Snippets(book.id).open('w')
 459         try:
 460             for header, position in zip(list(master), range(len(master))):
 461
 462                 if header.tag in self.skip_header_tags:
 463                     continue
 464                 if header.tag is etree.Comment:
 465                     continue
 466
 467                 # section content
 468                 content = []
 469                 footnote = []
 470
 471                 def all_content(text):
 472                     for frag in fragments.values():
 473                         frag['content'].append(text)
 474                     content.append(text)
 475                 handle_text = [all_content]
 476
 477
 478                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 479                     # handle footnotes
 480                     if start is not None and start.tag in self.footnote_tags:
 481                         footnote = []
 482                         def collect_footnote(t):
 483                             footnote.append(t)
 484                         handle_text.append(collect_footnote)
 485                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 486                         handle_text.pop()
 487                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 488                                        content=u''.join(footnote),
 489                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
 490
 491                         self.index.addDocument(doc)
 492                         #print "@ footnote text: %s" % footnote
 493                         footnote = []
 494
 495                     # handle fragments and themes.
 496                     if start is not None and start.tag == 'begin':
 497                         fid = start.attrib['id'][1:]
 498                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 499
 500                     # themes for this fragment
 501                     elif start is not None and start.tag == 'motyw':
 502                         fid = start.attrib['id'][1:]
 503                         handle_text.append(None)
 504                         if start.text is not None:
 505                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
 506                     elif end is not None and end.tag == 'motyw':
 507                         handle_text.pop()
 508
 509                     elif start is not None and start.tag == 'end':
 510                         fid = start.attrib['id'][1:]
 511                         if fid not in fragments:
 512                             continue  # a broken <end> node, skip it
 513                         frag = fragments[fid]
 514                         if frag['themes'] == []:
 515                             continue  # empty themes list.
 516                         del fragments[fid]
 517
 518                         doc = add_part(snippets,
 519                                        header_type=frag['start_header'],
 520                                        header_index=frag['start_section'],
 521                                        header_span=position - frag['start_section'] + 1,
 522                                        fragment_anchor=fid,
 523                                        content=fix_format(frag['content']),
 524                                        themes=frag['themes'])
 525                         #print '@ FRAG %s' % frag['content']
 526                         self.index.addDocument(doc)
 527
 528                         # Collect content.
 529
 530                     if text is not None and handle_text is not []:
 531                         hdl = handle_text[-1]
 532                         if hdl is not None:
 533                             hdl(text)
 534
 535                         # in the end, add a section text.
 536                 doc = add_part(snippets, header_index=position, header_type=header.tag,
 537                                content=fix_format(content))
 538                 #print '@ CONTENT: %s' % fix_format(content)
 539
 540                 self.index.addDocument(doc)
 541
 542         finally:
 543             snippets.close()
 544
 545
 546 def log_exception_wrapper(f):
 547     def _wrap(*a):
 548         try:
 549             f(*a)
 550         except Exception, e:
 551             print("Error in indexing thread: %s" % e)
 552             traceback.print_exc()
 553             raise e
 554     return _wrap
 555
 556
 557 class ReusableIndex(Index):
 558     """
 559     Works like index, but does not close/optimize Lucene index
 560     until program exit (uses atexit hook).
 561     This is usefull for importbooks command.
 562
 563     if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
 564     """
 565     index = None
 566
 567     def open(self, analyzer=None, **kw):
 568         if ReusableIndex.index:
 569             self.index = ReusableIndex.index
 570         else:
 571             print("opening index")
 572             Index.open(self, analyzer, **kw)
 573             ReusableIndex.index = self.index
 574             atexit.register(ReusableIndex.close_reusable)
 575
 576     # def index_book(self, *args, **kw):
 577     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
 578     #     ReusableIndex.pool_jobs.append(job)
 579
 580     @staticmethod
 581     def close_reusable():
 582         if ReusableIndex.index:
 583             print("closing index")
 584             ReusableIndex.index.optimize()
 585             ReusableIndex.index.close()
 586             ReusableIndex.index = None
 587
 588     def close(self):
 589         if ReusableIndex.index:
 590             ReusableIndex.index.commit()
 591
 592
 593 class JoinSearch(object):
 594     """
 595     This mixin could be used to handle block join queries.
 596     (currently unused)
 597     """
 598     def __init__(self, *args, **kw):
 599         super(JoinSearch, self).__init__(*args, **kw)
 600
 601     def wrapjoins(self, query, fields=[]):
 602         """
 603         This functions modifies the query in a recursive way,
 604         so Term and Phrase Queries contained, which match
 605         provided fields are wrapped in a BlockJoinQuery,
 606         and so delegated to children documents.
 607         """
 608         if BooleanQuery.instance_(query):
 609             qs = BooleanQuery.cast_(query)
 610             for clause in qs:
 611                 clause = BooleanClause.cast_(clause)
 612                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
 613             return qs
 614         else:
 615             termset = HashSet()
 616             query.extractTerms(termset)
 617             for t in termset:
 618                 t = Term.cast_(t)
 619                 if t.field() not in fields:
 620                     return query
 621             return BlockJoinQuery(query, self.parent_filter,
 622                                   BlockJoinQuery.ScoreMode.Total)
 623
 624     def bsearch(self, query, max_results=50):
 625         q = self.query(query)
 626         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
 627
 628         tops = self.searcher.search(bjq, max_results)
 629         bks = []
 630         for found in tops.scoreDocs:
 631             doc = self.searcher.doc(found.doc)
 632             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 633         return (bks, tops.totalHits)
 634
 635
 636 class SearchResult(object):
 637     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
 638         if tokens_cache is None: tokens_cache = {}
 639
 640         if score:
 641             self._score = score
 642         else:
 643             self._score = scoreDocs.score
 644
 645         self.boost = 1.0
 646
 647         self._hits = []
 648         self._processed_hits = None  # processed hits
 649
 650         stored = search.searcher.doc(scoreDocs.doc)
 651         self.book_id = int(stored.get("book_id"))
 652
 653         pd = stored.get("published_date")
 654         if pd is None:
 655             pd = 0
 656         self.published_date = int(pd)
 657
 658         header_type = stored.get("header_type")
 659         # we have a content hit in some header of fragment
 660         if header_type is not None:
 661             sec = (header_type, int(stored.get("header_index")))
 662             header_span = stored.get('header_span')
 663             header_span = header_span is not None and int(header_span) or 1
 664
 665             fragment = stored.get("fragment_anchor")
 666
 667             if snippets:
 668                 snippets = snippets.replace("/\n", "\n")
 669             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
 670
 671             self._hits.append(hit)
 672
 673         self.search = search
 674         self.searched = searched
 675         self.tokens_cache = tokens_cache
 676
 677     @property
 678     def score(self):
 679         return self._score * self.boost
 680
 681     def merge(self, other):
 682         if self.book_id != other.book_id:
 683             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 684         self._hits += other._hits
 685         if other.score > self.score:
 686             self._score = other._score
 687         return self
 688
 689     def get_book(self):
 690         return catalogue.models.Book.objects.get(id=self.book_id)
 691
 692     book = property(get_book)
 693
 694     @property
 695     def hits(self):
 696         if self._processed_hits is not None:
 697             return self._processed_hits
 698
 699         POSITION = 0
 700         FRAGMENT = 1
 701         POSITION_INDEX = 1
 702         POSITION_SPAN = 2
 703         SCORE = 2
 704         OTHER = 3
 705
 706         # to sections and fragments
 707         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
 708         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
 709         sect = filter(lambda s: 0 == len(filter(
 710             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
 711             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
 712             frags)), sect)
 713
 714         hits = []
 715
 716         # remove duplicate fragments
 717         fragments = {}
 718         for f in frags:
 719             fid = f[FRAGMENT]
 720             if fid in fragments:
 721                 if fragments[fid][SCORE] >= f[SCORE]:
 722                     continue
 723             fragments[fid] = f
 724         frags = fragments.values()
 725
 726         # remove duplicate sections
 727         sections = {}
 728
 729         for s in sect:
 730             si = s[POSITION][POSITION_INDEX]
 731             # skip existing
 732             if si in sections:
 733                 if sections[si]['score'] >= s[SCORE]:
 734                     continue
 735
 736             m = {'score': s[SCORE],
 737                  'section_number': s[POSITION][POSITION_INDEX] + 1,
 738                  }
 739             m.update(s[OTHER])
 740             sections[si] = m
 741
 742         hits = sections.values()
 743
 744         for f in frags:
 745             try:
 746                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
 747             except catalogue.models.Fragment.DoesNotExist:
 748                 # stale index
 749                 continue
 750
 751             # Figure out if we were searching for a token matching some word in theme name.
 752             themes = frag.tags.filter(category='theme')
 753             themes_hit = []
 754             if self.searched is not None:
 755                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
 756                 for theme in themes:
 757                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
 758                     for t in tokens:
 759                         if t in name_tokens:
 760                             if not theme in themes_hit:
 761                                 themes_hit.append(theme)
 762                             break
 763
 764             m = {'score': f[SCORE],
 765                  'fragment': frag,
 766                  'section_number': f[POSITION][POSITION_INDEX] + 1,
 767                  'themes': themes,
 768                  'themes_hit': themes_hit
 769                  }
 770             m.update(f[OTHER])
 771             hits.append(m)
 772
 773         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 774
 775         self._processed_hits = hits
 776
 777         return hits
 778
 779     def __unicode__(self):
 780         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
 781
 782     @staticmethod
 783     def aggregate(*result_lists):
 784         books = {}
 785         for rl in result_lists:
 786             for r in rl:
 787                 if r.book_id in books:
 788                     books[r.book_id].merge(r)
 789                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
 790                 else:
 791                     books[r.book_id] = r
 792         return books.values()
 793
 794     def __cmp__(self, other):
 795         c = cmp(self.score, other.score)
 796         if c == 0:
 797             # this is inverted, because earlier date is better
 798             return cmp(other.published_date, self.published_date)
 799         else:
 800             return c
 801
 802
 803 class Hint(object):
 804     """
 805     Given some hint information (information we already know about)
 806     our search target - like author, title (specific book), epoch, genre, kind
 807     we can narrow down search using filters.
 808     """
 809     def __init__(self, search):
 810         """
 811         Accepts a Searcher instance.
 812         """
 813         self.search = search
 814         self.book_tags = {}
 815         self.part_tags = []
 816         self._books = []
 817
 818     def books(self, *books):
 819         """
 820         Give a hint that we search these books.
 821         """
 822         self._books = books
 823
 824     def tags(self, tags):
 825         """
 826         Give a hint that these Tag objects (a list of)
 827         is necessary.
 828         """
 829         for t in tags:
 830             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
 831                 lst = self.book_tags.get(t.category, [])
 832                 lst.append(t)
 833                 self.book_tags[t.category] = lst
 834             if t.category in ['theme', 'theme_pl']:
 835                 self.part_tags.append(t)
 836
 837     def tag_filter(self, tags, field='tags'):
 838         """
 839         Given a lsit of tags and an optional field (but they are normally in tags field)
 840         returns a filter accepting only books with specific tags.
 841         """
 842         q = BooleanQuery()
 843
 844         for tag in tags:
 845             toks = self.search.get_tokens(tag.name, field=field)
 846             tag_phrase = PhraseQuery()
 847             for tok in toks:
 848                 tag_phrase.add(Term(field, tok))
 849             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
 850
 851         return QueryWrapperFilter(q)
 852
 853     def book_filter(self):
 854         """
 855         Filters using book tags (all tag kinds except a theme)
 856         """
 857         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
 858         if tags:
 859             return self.tag_filter(tags)
 860         else:
 861             return None
 862
 863     def part_filter(self):
 864         """
 865         This filter can be used to look for book parts.
 866         It filters on book id and/or themes.
 867         """
 868         fs = []
 869         if self.part_tags:
 870             fs.append(self.tag_filter(self.part_tags, field='themes'))
 871
 872         if self._books != []:
 873             bf = BooleanFilter()
 874             for b in self._books:
 875                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
 876                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
 877             fs.append(bf)
 878
 879         return Search.chain_filters(fs)
 880
 881     def should_search_for_book(self):
 882         return self._books == []
 883
 884     def just_search_in(self, all):
 885         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
 886         some = []
 887         for field in all:
 888             if field == 'authors' and 'author' in self.book_tags:
 889                 continue
 890             if field == 'title' and self._books != []:
 891                 continue
 892             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
 893                 continue
 894             some.append(field)
 895         return some
 896
 897
 898 class Search(IndexStore):
 899     """
 900     Search facilities.
 901     """
 902     def __init__(self, default_field="content"):
 903         IndexStore.__init__(self)
 904         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
 905         # self.analyzer = WLAnalyzer()
 906         self.searcher = IndexSearcher(self.store, True)
 907         self.parser = QueryParser(Version.LUCENE_34, default_field,
 908                                   self.analyzer)
 909
 910         self.parent_filter = TermsFilter()
 911         self.parent_filter.addTerm(Term("is_book", "true"))
 912
 913     def query(self, query):
 914         """Parse query in default Lucene Syntax. (for humans)
 915         """
 916         return self.parser.parse(query)
 917
 918     def simple_search(self, query, max_results=50):
 919         """Runs a query for books using lucene syntax. (for humans)
 920         Returns (books, total_hits)
 921         """
 922
 923         tops = self.searcher.search(self.query(query), max_results)
 924         bks = []
 925         for found in tops.scoreDocs:
 926             doc = self.searcher.doc(found.doc)
 927             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 928         return (bks, tops.totalHits)
 929
 930     def get_tokens(self, searched, field='content', cached=None):
 931         """returns tokens analyzed by a proper (for a field) analyzer
 932         argument can be: StringReader, string/unicode, or tokens. In the last case
 933         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
 934         """
 935         if cached is not None and field in cached:
 936             return cached[field]
 937
 938         if isinstance(searched, str) or isinstance(searched, unicode):
 939             searched = StringReader(searched)
 940         elif isinstance(searched, list):
 941             return searched
 942
 943         searched.reset()
 944         tokens = self.analyzer.reusableTokenStream(field, searched)
 945         toks = []
 946         while tokens.incrementToken():
 947             cta = tokens.getAttribute(CharTermAttribute.class_)
 948             toks.append(cta.toString())
 949
 950         if cached is not None:
 951             cached[field] = toks
 952
 953         return toks
 954
 955     def fuzziness(self, fuzzy):
 956         """Helper method to sanitize fuzziness"""
 957         if not fuzzy:
 958             return None
 959         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
 960             return fuzzy
 961         else:
 962             return 0.5
 963
 964     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
 965         """
 966         Return a PhraseQuery with a series of tokens.
 967         """
 968         if fuzzy:
 969             phrase = MultiPhraseQuery()
 970             for t in tokens:
 971                 term = Term(field, t)
 972                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
 973                 fuzzterms = []
 974
 975                 while True:
 976                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
 977                     ft = fuzzterm.term()
 978                     if ft:
 979                         fuzzterms.append(ft)
 980                     if not fuzzterm.next(): break
 981                 if fuzzterms:
 982                     phrase.add(JArray('object')(fuzzterms, Term))
 983                 else:
 984                     phrase.add(term)
 985         else:
 986             phrase = PhraseQuery()
 987             phrase.setSlop(slop)
 988             for t in tokens:
 989                 term = Term(field, t)
 990                 phrase.add(term)
 991         return phrase
 992
 993     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
 994         """
 995         Returns term queries joined by boolean query.
 996         modal - applies to boolean query
 997         fuzzy - should the query by fuzzy.
 998         """
 999         q = BooleanQuery()
1000         for t in tokens:
1001             term = Term(field, t)
1002             if fuzzy:
1003                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1004             else:
1005                 term = TermQuery(term)
1006             q.add(BooleanClause(term, modal))
1007         return q
1008
1009     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1010                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1011         if filters is None: filters = []
1012         if tokens_cache is None: tokens_cache = {}
1013
1014         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1015
1016         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1017         if book:
1018             filters.append(self.term_filter(Term('is_book', 'true')))
1019         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1020
1021         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1022
1023     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1024                     filters=None, tokens_cache=None, boost=None, snippets=True):
1025         if filters is None: filters = []
1026         if tokens_cache is None: tokens_cache = {}
1027
1028         if book:
1029             filters.append(self.term_filter(Term('is_book', 'true')))
1030
1031         query = BooleanQuery()
1032
1033         for fld in fields:
1034             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1035
1036             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1037                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1038
1039         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1040
1041         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1042                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1043
1044     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1045         """
1046         Search for perfect book matches. Just see if the query matches with some author or title,
1047         taking hints into account.
1048         """
1049         fields_to_search = ['authors', 'title']
1050         only_in = None
1051         if hint:
1052             if not hint.should_search_for_book():
1053                 return []
1054             fields_to_search = hint.just_search_in(fields_to_search)
1055             only_in = hint.book_filter()
1056
1057         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1058
1059         books = []
1060         for q in qrys:
1061             top = self.searcher.search(q,
1062                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1063                 max_results)
1064             for found in top.scoreDocs:
1065                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1066         return books
1067
1068     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1069         fields_to_search = ['tags', 'authors', 'title']
1070
1071         only_in = None
1072         if hint:
1073             if not hint.should_search_for_book():
1074                 return []
1075             fields_to_search = hint.just_search_in(fields_to_search)
1076             only_in = hint.book_filter()
1077
1078         tokens = self.get_tokens(searched, field='SIMPLE')
1079
1080         q = BooleanQuery()
1081
1082         for fld in fields_to_search:
1083             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1084                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1085
1086         books = []
1087         top = self.searcher.search(q,
1088                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1089             max_results)
1090         for found in top.scoreDocs:
1091             books.append(SearchResult(self, found, how_found="search_book"))
1092
1093         return books
1094
1095     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1096         """
1097         Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
1098         some part/fragment of the book.
1099         """
1100         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1101
1102         flt = None
1103         if hint:
1104             flt = hint.part_filter()
1105
1106         books = []
1107         for q in qrys:
1108             top = self.searcher.search(q,
1109                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1110                                                            flt]),
1111                                        max_results)
1112             for found in top.scoreDocs:
1113                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1114
1115         return books
1116
1117     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1118         """
1119         Tries to use search terms to match different fields of book (or its parts).
1120         E.g. one word can be an author survey, another be a part of the title, and the rest
1121         are some words from third chapter.
1122         """
1123         if tokens_cache is None: tokens_cache = {}
1124         books = []
1125         only_in = None
1126
1127         if hint:
1128             only_in = hint.part_filter()
1129
1130         # content only query : themes x content
1131         q = BooleanQuery()
1132
1133         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1134         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1135
1136         # only search in themes when we do not already filter by themes
1137         if hint is None or hint.just_search_in(['themes']) != []:
1138             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1139                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1140
1141         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1142                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1143
1144         topDocs = self.searcher.search(q, only_in, max_results)
1145         for found in topDocs.scoreDocs:
1146             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1147             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1148
1149         # query themes/content x author/title/tags
1150         q = BooleanQuery()
1151         in_content = BooleanQuery()
1152         in_meta = BooleanQuery()
1153
1154         for fld in ['themes_pl', 'content']:
1155             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1156
1157         for fld in ['tags', 'authors', 'title']:
1158             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1159
1160         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1161         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1162
1163         topDocs = self.searcher.search(q, only_in, max_results)
1164         for found in topDocs.scoreDocs:
1165             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1166             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1167
1168         return books
1169
1170     # def multisearch(self, query, max_results=50):
1171     #     """
1172     #     Search strategy:
1173     #     - (phrase) OR -> content
1174     #                   -> title
1175     #                   -> authors
1176     #     - (keywords)  -> authors
1177     #                   -> motyw
1178     #                   -> tags
1179     #                   -> content
1180     #     """
1181         # queryreader = StringReader(query)
1182         # tokens = self.get_tokens(queryreader)
1183
1184         # top_level = BooleanQuery()
1185         # Should = BooleanClause.Occur.SHOULD
1186
1187         # phrase_level = BooleanQuery()
1188         # phrase_level.setBoost(1.3)
1189
1190         # p_content = self.make_phrase(tokens, joined=True)
1191         # p_title = self.make_phrase(tokens, 'title')
1192         # p_author = self.make_phrase(tokens, 'author')
1193
1194         # phrase_level.add(BooleanClause(p_content, Should))
1195         # phrase_level.add(BooleanClause(p_title, Should))
1196         # phrase_level.add(BooleanClause(p_author, Should))
1197
1198         # kw_level = BooleanQuery()
1199
1200         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1201         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1202         # kw_level.add(j_themes, Should)
1203         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1204         # j_con = self.make_term_query(tokens, joined=True)
1205         # kw_level.add(j_con, Should)
1206
1207         # top_level.add(BooleanClause(phrase_level, Should))
1208         # top_level.add(BooleanClause(kw_level, Should))
1209
1210         # return None
1211
1212     def get_snippets(self, scoreDoc, query, field='content'):
1213         """
1214         Returns a snippet for found scoreDoc.
1215         """
1216         htmlFormatter = SimpleHTMLFormatter()
1217         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1218
1219         stored = self.searcher.doc(scoreDoc.doc)
1220
1221         position = stored.get('snippets_position')
1222         length = stored.get('snippets_length')
1223         if position is None or length is None:
1224             return None
1225         # locate content.
1226         book_id = int(stored.get('book_id'))
1227         snippets = Snippets(book_id).open()
1228         try:
1229             try:
1230                 text = snippets.get((int(position),
1231                                      int(length)))
1232             finally:
1233                 snippets.close()
1234
1235             tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1236             #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1237             snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1238
1239         except Exception, e:
1240             e2 = e
1241             if hasattr(e, 'getJavaException'):
1242                 e2 = unicode(e.getJavaException())
1243             raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1244                 e2)
1245         return snip
1246
1247     @staticmethod
1248     def enum_to_array(enum):
1249         """
1250         Converts a lucene TermEnum to array of Terms, suitable for
1251         addition to queries
1252         """
1253         terms = []
1254
1255         while True:
1256             t = enum.term()
1257             if t:
1258                 terms.append(t)
1259             if not enum.next(): break
1260
1261         if terms:
1262             return JArray('object')(terms, Term)
1263
1264     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1265         """
1266         Search for Tag objects using query.
1267         """
1268         if not pdcounter:
1269             filters = self.chain_filters([filter, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1270         tops = self.searcher.search(query, filters, max_results)
1271
1272         tags = []
1273         for found in tops.scoreDocs:
1274             doc = self.searcher.doc(found.doc)
1275             is_pdcounter = doc.get('is_pdcounter')
1276             category = doc.get('tag_category')
1277             if is_pdcounter == 'true':
1278                 if category == 'pd_author':
1279                     tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1280                 elif category == 'pd_book':
1281                     tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1282                     tag.category = 'pd_book'  # make it look more lik a tag.
1283                 else:
1284                     print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1285             else:
1286                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1287                 # don't add the pdcounter tag if same tag already exists
1288             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1289                 tags.append(tag)
1290                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1291         print 'returning %s' % tags
1292         return tags
1293
1294     def search_books(self, query, filter=None, max_results=10):
1295         """
1296         Searches for Book objects using query
1297         """
1298         bks = []
1299         tops = self.searcher.search(query, filter, max_results)
1300         for found in tops.scoreDocs:
1301             doc = self.searcher.doc(found.doc)
1302             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1303         return bks
1304
1305     def make_prefix_phrase(self, toks, field):
1306         q = MultiPhraseQuery()
1307         for i in range(len(toks)):
1308             t = Term(field, toks[i])
1309             if i == len(toks) - 1:
1310                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1311                 if pterms:
1312                     q.add(pterms)
1313                 else:
1314                     q.add(t)
1315             else:
1316                 q.add(t)
1317         return q
1318
1319     @staticmethod
1320     def term_filter(term, inverse=False):
1321         only_term = TermsFilter()
1322         only_term.addTerm(term)
1323
1324         if inverse:
1325             neg = BooleanFilter()
1326             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1327             only_term = neg
1328
1329         return only_term
1330
1331     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1332         """
1333         Return auto-complete hints for tags
1334         using prefix search.
1335         """
1336         toks = self.get_tokens(string, field='SIMPLE')
1337         top = BooleanQuery()
1338
1339         for field in ['tag_name', 'tag_name_pl']:
1340             if prefix:
1341                 q = self.make_prefix_phrase(toks, field)
1342             else:
1343                 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1344             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1345
1346         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1347
1348         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1349
1350     def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1351         """
1352         Returns auto-complete hints for book titles
1353         Because we do not index 'pseudo' title-tags.
1354         Prefix search.
1355         """
1356         toks = self.get_tokens(string, field='SIMPLE')
1357
1358         if prefix:
1359             q = self.make_prefix_phrase(toks, 'title')
1360         else:
1361             q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1362
1363         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1364
1365     @staticmethod
1366     def chain_filters(filters, op=ChainedFilter.AND):
1367         """
1368         Chains a filter list together
1369         """
1370         filters = filter(lambda x: x is not None, filters)
1371         if not filters or filters is []:
1372             return None
1373         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1374         return chf
1375
1376     def filtered_categories(self, tags):
1377         """
1378         Return a list of tag categories, present in tags list.
1379         """
1380         cats = {}
1381         for t in tags:
1382             cats[t.category] = True
1383         return cats.keys()
1384
1385     def hint(self):
1386         return Hint(self)