apps/search/index.py

   1 # -*- coding: utf-8 -*-
   2
   3 from django.conf import settings
   4 from django.dispatch import Signal
   5 from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
   6     File, Field, Integer, \
   7     NumericField, Version, Document, JavaError, IndexSearcher, \
   8     QueryParser, PerFieldAnalyzerWrapper, \
   9     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
  10     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
  11     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
  12     HashSet, BooleanClause, Term, CharTermAttribute, \
  13     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
  14     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
  15     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
  16     BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
  17     initVM, CLASSPATH, JArray, JavaError
  18     # KeywordAnalyzer
  19
  20 # Initialize jvm
  21 JVM = initVM(CLASSPATH)
  22
  23 import sys
  24 import os
  25 import re
  26 import errno
  27 from librarian import dcparser
  28 from librarian.parser import WLDocument
  29 from lxml import etree
  30 import catalogue.models
  31 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  32 from multiprocessing.pool import ThreadPool
  33 from threading import current_thread
  34 from itertools import chain
  35 import atexit
  36 import traceback
  37 import logging
  38 log = logging.getLogger('search')
  39
  40 class WLAnalyzer(PerFieldAnalyzerWrapper):
  41     def __init__(self):
  42         polish = PolishAnalyzer(Version.LUCENE_34)
  43         #        polish_gap.setPositionIncrementGap(999)
  44
  45         simple = SimpleAnalyzer(Version.LUCENE_34)
  46         #        simple_gap.setPositionIncrementGap(999)
  47
  48         keyword = KeywordAnalyzer(Version.LUCENE_34)
  49
  50         # not sure if needed: there's NOT_ANALYZED meaning basically the same
  51
  52         PerFieldAnalyzerWrapper.__init__(self, polish)
  53
  54         self.addAnalyzer("tags", simple)
  55         self.addAnalyzer("technical_editors", simple)
  56         self.addAnalyzer("editors", simple)
  57         self.addAnalyzer("url", keyword)
  58         self.addAnalyzer("source_url", keyword)
  59         self.addAnalyzer("source_name", simple)
  60         self.addAnalyzer("publisher", simple)
  61         self.addAnalyzer("authors", simple)
  62         self.addAnalyzer("title", simple)
  63
  64         self.addAnalyzer("is_book", keyword)
  65         # shouldn't the title have two forms? _pl and simple?
  66
  67         self.addAnalyzer("themes", simple)
  68         self.addAnalyzer("themes_pl", polish)
  69
  70         self.addAnalyzer("tag_name", simple)
  71         self.addAnalyzer("tag_name_pl", polish)
  72
  73         self.addAnalyzer("translators", simple)
  74
  75         self.addAnalyzer("KEYWORD", keyword)
  76         self.addAnalyzer("SIMPLE", simple)
  77         self.addAnalyzer("POLISH", polish)
  78
  79
  80 class IndexStore(object):
  81     """
  82     Provides access to search index.
  83
  84     self.store - lucene index directory
  85     """
  86     def __init__(self):
  87         self.make_index_dir()
  88         self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))
  89
  90     def make_index_dir(self):
  91         try:
  92             os.makedirs(settings.SEARCH_INDEX)
  93         except OSError as exc:
  94             if exc.errno == errno.EEXIST:
  95                 pass
  96             else: raise
  97
  98     def close(self):
  99         self.store.close()
 100
 101
 102 class IndexChecker(IndexStore):
 103     def __init__(self):
 104         IndexStore.__init__(self)
 105
 106     def check(self):
 107         checker = CheckIndex(self.store)
 108         status = checker.checkIndex()
 109         return status
 110
 111
 112 class Snippets(object):
 113     """
 114     This class manages snippet files for indexed object (book)
 115     the snippets are concatenated together, and their positions and
 116     lengths are kept in lucene index fields.
 117     """
 118     SNIPPET_DIR = "snippets"
 119
 120     def __init__(self, book_id, revision=None):
 121         try:
 122             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
 123         except OSError as exc:
 124             if exc.errno == errno.EEXIST:
 125                 pass
 126             else: raise
 127         self.book_id = book_id
 128         self.revision = revision
 129         self.file = None
 130
 131     @property
 132     def path(self):
 133         if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
 134         else: fn = "%d" % self.book_id
 135
 136         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
 137
 138     def open(self, mode='r'):
 139         """
 140         Open the snippet file. Call .close() afterwards.
 141         """
 142         if not 'b' in mode:
 143             mode += 'b'
 144
 145         if 'w' in mode:
 146             if os.path.exists(self.path):
 147                 self.revision = 1
 148                 while True:
 149                     if not os.path.exists(self.path):
 150                         break
 151                     self.revision += 1
 152
 153         self.file = open(self.path, mode)
 154         self.position = 0
 155         return self
 156
 157     def add(self, snippet):
 158         """
 159         Append a snippet (unicode) to the snippet file.
 160         Return a (position, length) tuple
 161         """
 162         txt = snippet.encode('utf-8')
 163         l = len(txt)
 164         self.file.write(txt)
 165         pos = (self.position, l)
 166         self.position += l
 167         return pos
 168
 169     def get(self, pos):
 170         """
 171         Given a tuple of (position, length) return an unicode
 172         of the snippet stored there.
 173         """
 174         self.file.seek(pos[0], 0)
 175         txt = self.file.read(pos[1]).decode('utf-8')
 176         return txt
 177
 178     def close(self):
 179         """Close snippet file"""
 180         self.file.close()
 181
 182     def remove(self):
 183         self.revision = None
 184         try:
 185             os.unlink(self.path)
 186             self.revision = 0
 187             while True:
 188                 self.revision += 1
 189                 os.unlink(self.path)
 190         except OSError:
 191             pass
 192
 193
 194 class BaseIndex(IndexStore):
 195     """
 196     Base index class.
 197     Provides basic operations on index: opening, closing, optimizing.
 198     """
 199     def __init__(self, analyzer=None):
 200         super(BaseIndex, self).__init__()
 201         self.index = None
 202         if not analyzer:
 203             analyzer = WLAnalyzer()
 204         self.analyzer = analyzer
 205
 206     def open(self, timeout=None):
 207         if self.index:
 208             raise Exception("Index is already opened")
 209         conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
 210         if timeout:
 211             conf.setWriteLockTimeout(long(timeout))
 212         self.index = IndexWriter(self.store, conf)
 213         return self.index
 214
 215     def optimize(self):
 216         self.index.optimize()
 217
 218     def close(self):
 219         try:
 220             self.index.optimize()
 221         except JavaError, je:
 222             log.error("Error during optimize phase, check index: %s" % je)
 223
 224         self.index.close()
 225         self.index = None
 226
 227         index_changed.send_robust(self)
 228
 229         super(BaseIndex, self).close()
 230
 231     def __enter__(self):
 232         self.open()
 233         return self
 234
 235     def __exit__(self, type, value, tb):
 236         self.close()
 237
 238
 239 index_changed = Signal()
 240
 241
 242 class Index(BaseIndex):
 243     """
 244     Class indexing books.
 245     """
 246     def __init__(self, analyzer=None):
 247         super(Index, self).__init__(analyzer)
 248
 249     def index_tags(self, *tags, **kw):
 250         """
 251         Re-index global tag list.
 252         Removes all tags from index, then index them again.
 253         Indexed fields include: id, name (with and without polish stems), category
 254         """
 255         remove_only = kw.get('remove_only', False)
 256         # first, remove tags from index.
 257         if tags:
 258             q = BooleanQuery()
 259             for tag in tags:
 260                 b_id_cat = BooleanQuery()
 261
 262                 q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
 263                 b_id_cat.add(q_id, BooleanClause.Occur.MUST)
 264
 265                 if isinstance(tag, PDCounterAuthor):
 266                     q_cat = TermQuery(Term('tag_category', 'pd_author'))
 267                 elif isinstance(tag, PDCounterBook):
 268                     q_cat = TermQuery(Term('tag_category', 'pd_book'))
 269                 else:
 270                     q_cat = TermQuery(Term('tag_category', tag.category))
 271                 b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
 272
 273                 q.add(b_id_cat, BooleanClause.Occur.SHOULD)
 274         else:  # all
 275             q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
 276             self.index.deleteDocuments(q)
 277
 278         if not remove_only:
 279             # then add them [all or just one passed]
 280             if not tags:
 281                 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
 282                     PDCounterAuthor.objects.all(), \
 283                     PDCounterBook.objects.all())
 284
 285             for tag in tags:
 286                 if isinstance(tag, PDCounterAuthor):
 287                     doc = Document()
 288                     doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
 289                     doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 290                     doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 291                     doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
 292                     doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
 293                     self.index.addDocument(doc)
 294                 elif isinstance(tag, PDCounterBook):
 295                     doc = Document()
 296                     doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
 297                     doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
 298                     doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
 299                     doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
 300                     doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
 301                     self.index.addDocument(doc)
 302                 else:
 303                     doc = Document()
 304                     doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
 305                     doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 306                     doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 307                     doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
 308                     self.index.addDocument(doc)
 309
 310     def create_book_doc(self, book):
 311         """
 312         Create a lucene document referring book id.
 313         """
 314         doc = Document()
 315         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
 316         if book.parent is not None:
 317             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
 318         return doc
 319
 320     def remove_book(self, book, remove_snippets=True):
 321         """Removes a book from search index.
 322         book - Book instance."""
 323         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
 324         self.index.deleteDocuments(q)
 325
 326         if remove_snippets:
 327             snippets = Snippets(book.id)
 328             snippets.remove()
 329
 330     def index_book(self, book, book_info=None, overwrite=True):
 331         """
 332         Indexes the book.
 333         Creates a lucene document for extracted metadata
 334         and calls self.index_content() to index the contents of the book.
 335         """
 336         if overwrite:
 337             # we don't remove snippets, since they might be still needed by
 338             # threads using not reopened index
 339             self.remove_book(book, remove_snippets=False)
 340
 341         book_doc = self.create_book_doc(book)
 342         meta_fields = self.extract_metadata(book, book_info)
 343         for f in meta_fields.values():
 344             if isinstance(f, list) or isinstance(f, tuple):
 345                 for elem in f:
 346                     book_doc.add(elem)
 347             else:
 348                 book_doc.add(f)
 349         self.index.addDocument(book_doc)
 350         del book_doc
 351
 352         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
 353
 354     master_tags = [
 355         'opowiadanie',
 356         'powiesc',
 357         'dramat_wierszowany_l',
 358         'dramat_wierszowany_lp',
 359         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 360         'wywiad',
 361         ]
 362
 363     ignore_content_tags = [
 364         'uwaga', 'extra',
 365         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 366         'didaskalia',
 367         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 368         ]
 369
 370     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 371
 372     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 373
 374     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 375
 376     def extract_metadata(self, book, book_info=None):
 377         """
 378         Extract metadata from book and returns a map of fields keyed by fieldname
 379         """
 380         fields = {}
 381
 382         if book_info is None:
 383             book_info = dcparser.parse(open(book.xml_file.path))
 384
 385         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
 386         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
 387         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
 388
 389         # validator, name
 390         for field in dcparser.BookInfo.FIELDS:
 391             if hasattr(book_info, field.name):
 392                 if not getattr(book_info, field.name):
 393                     continue
 394                 # since no type information is available, we use validator
 395                 type_indicator = field.validator
 396                 if type_indicator == dcparser.as_unicode:
 397                     s = getattr(book_info, field.name)
 398                     if field.multiple:
 399                         s = ', '.join(s)
 400                     try:
 401                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
 402                     except JavaError as je:
 403                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
 404                 elif type_indicator == dcparser.as_person:
 405                     p = getattr(book_info, field.name)
 406                     if isinstance(p, dcparser.Person):
 407                         persons = unicode(p)
 408                     else:
 409                         persons = ', '.join(map(unicode, p))
 410                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
 411                 elif type_indicator == dcparser.as_date:
 412                     dt = getattr(book_info, field.name)
 413                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
 414                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 415
 416         # get published date
 417         pd = None
 418         if hasattr(book_info, 'source_name') and book_info.source_name:
 419             match = self.published_date_re.search(book_info.source_name)
 420             if match is not None:
 421                 pd = str(match.groups()[0])
 422         if not pd: pd = ""
 423         fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
 424
 425         return fields
 426
 427     def add_gaps(self, fields, fieldname):
 428         """
 429         Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 430         This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 431         """
 432         def gap():
 433             while True:
 434                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 435         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 436
 437     def get_master(self, root):
 438         """
 439         Returns the first master tag from an etree.
 440         """
 441         for master in root.iter():
 442             if master.tag in self.master_tags:
 443                 return master
 444
 445     def index_content(self, book, book_fields=[]):
 446         """
 447         Walks the book XML and extract content from it.
 448         Adds parts for each header tag and for each fragment.
 449         """
 450         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 451         root = wld.edoc.getroot()
 452
 453         master = self.get_master(root)
 454         if master is None:
 455             return []
 456
 457         def walker(node, ignore_tags=[]):
 458
 459             if node.tag not in ignore_tags:
 460                 yield node, None, None
 461                 if node.text is not None:
 462                     yield None, node.text, None
 463                 for child in list(node):
 464                     for b, t, e in walker(child):
 465                         yield b, t, e
 466                 yield None, None, node
 467
 468             if node.tail is not None:
 469                 yield None, node.tail, None
 470             return
 471
 472         def fix_format(text):
 473             #            separator = [u" ", u"\t", u".", u";", u","]
 474             if isinstance(text, list):
 475                 # need to join it first
 476                 text = filter(lambda s: s is not None, content)
 477                 text = u' '.join(text)
 478                 # for i in range(len(text)):
 479                 #     if i > 0:
 480                 #         if text[i][0] not in separator\
 481                 #             and text[i - 1][-1] not in separator:
 482                 #          text.insert(i, u" ")
 483
 484             return re.sub("(?m)/$", "", text)
 485
 486         def add_part(snippets, **fields):
 487             doc = self.create_book_doc(book)
 488             for f in book_fields:
 489                 doc.add(f)
 490
 491             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
 492             doc.add(NumericField("header_span", Field.Store.YES, True)\
 493                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
 494             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
 495
 496             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
 497                           Field.TermVector.WITH_POSITIONS_OFFSETS))
 498
 499             snip_pos = snippets.add(fields["content"])
 500             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
 501             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
 502             if snippets.revision:
 503                 doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
 504
 505             if 'fragment_anchor' in fields:
 506                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
 507                               Field.Store.YES, Field.Index.NOT_ANALYZED))
 508
 509             if 'themes' in fields:
 510                 themes, themes_pl = zip(*[
 511                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
 512                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
 513                      for theme in fields['themes']])
 514
 515                 themes = self.add_gaps(themes, 'themes')
 516                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
 517
 518                 for t in themes:
 519                     doc.add(t)
 520                 for t in themes_pl:
 521                     doc.add(t)
 522
 523             return doc
 524
 525         def give_me_utf8(s):
 526             if isinstance(s, unicode):
 527                 return s.encode('utf-8')
 528             else:
 529                 return s
 530
 531         fragments = {}
 532         snippets = Snippets(book.id).open('w')
 533         try:
 534             for header, position in zip(list(master), range(len(master))):
 535
 536                 if header.tag in self.skip_header_tags:
 537                     continue
 538                 if header.tag is etree.Comment:
 539                     continue
 540
 541                 # section content
 542                 content = []
 543                 footnote = []
 544
 545                 def all_content(text):
 546                     for frag in fragments.values():
 547                         frag['content'].append(text)
 548                     content.append(text)
 549                 handle_text = [all_content]
 550
 551
 552                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 553                     # handle footnotes
 554                     if start is not None and start.tag in self.footnote_tags:
 555                         footnote = []
 556                         def collect_footnote(t):
 557                             footnote.append(t)
 558                         handle_text.append(collect_footnote)
 559                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 560                         handle_text.pop()
 561                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 562                                        content=u''.join(footnote),
 563                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
 564
 565                         self.index.addDocument(doc)
 566                         #print "@ footnote text: %s" % footnote
 567                         footnote = []
 568
 569                     # handle fragments and themes.
 570                     if start is not None and start.tag == 'begin':
 571                         fid = start.attrib['id'][1:]
 572                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 573
 574                     # themes for this fragment
 575                     elif start is not None and start.tag == 'motyw':
 576                         fid = start.attrib['id'][1:]
 577                         handle_text.append(None)
 578                         if start.text is not None:
 579                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
 580                     elif end is not None and end.tag == 'motyw':
 581                         handle_text.pop()
 582
 583                     elif start is not None and start.tag == 'end':
 584                         fid = start.attrib['id'][1:]
 585                         if fid not in fragments:
 586                             continue  # a broken <end> node, skip it
 587                         frag = fragments[fid]
 588                         if frag['themes'] == []:
 589                             continue  # empty themes list.
 590                         del fragments[fid]
 591
 592                         doc = add_part(snippets,
 593                                        header_type=frag['start_header'],
 594                                        header_index=frag['start_section'],
 595                                        header_span=position - frag['start_section'] + 1,
 596                                        fragment_anchor=fid,
 597                                        content=fix_format(frag['content']),
 598                                        themes=frag['themes'])
 599                         #print '@ FRAG %s' % frag['content']
 600                         self.index.addDocument(doc)
 601
 602                         # Collect content.
 603
 604                     if text is not None and handle_text is not []:
 605                         hdl = handle_text[-1]
 606                         if hdl is not None:
 607                             hdl(text)
 608
 609                         # in the end, add a section text.
 610                 doc = add_part(snippets, header_index=position, header_type=header.tag,
 611                                content=fix_format(content))
 612                 #print '@ CONTENT: %s' % fix_format(content)
 613
 614                 self.index.addDocument(doc)
 615
 616         finally:
 617             snippets.close()
 618
 619
 620 def log_exception_wrapper(f):
 621     def _wrap(*a):
 622         try:
 623             f(*a)
 624         except Exception, e:
 625             log.error("Error in indexing thread: %s" % e)
 626             traceback.print_exc()
 627             raise e
 628     return _wrap
 629
 630
 631 class ReusableIndex(Index):
 632     """
 633     Works like index, but does not close/optimize Lucene index
 634     until program exit (uses atexit hook).
 635     This is usefull for importbooks command.
 636
 637     if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
 638     """
 639     index = None
 640
 641     def open(self, analyzer=None, **kw):
 642         if ReusableIndex.index:
 643             self.index = ReusableIndex.index
 644         else:
 645             Index.open(self, analyzer, **kw)
 646             ReusableIndex.index = self.index
 647             atexit.register(ReusableIndex.close_reusable)
 648
 649     # def index_book(self, *args, **kw):
 650     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
 651     #     ReusableIndex.pool_jobs.append(job)
 652
 653     @staticmethod
 654     def close_reusable():
 655         if ReusableIndex.index:
 656             ReusableIndex.index.optimize()
 657             ReusableIndex.index.close()
 658             ReusableIndex.index = None
 659
 660             index_changed.send_robust(None)
 661
 662     def close(self):
 663         if ReusableIndex.index:
 664             ReusableIndex.index.commit()
 665
 666
 667 class JoinSearch(object):
 668     """
 669     This mixin could be used to handle block join queries.
 670     (currently unused)
 671     """
 672     def __init__(self, *args, **kw):
 673         super(JoinSearch, self).__init__(*args, **kw)
 674
 675     def wrapjoins(self, query, fields=[]):
 676         """
 677         This functions modifies the query in a recursive way,
 678         so Term and Phrase Queries contained, which match
 679         provided fields are wrapped in a BlockJoinQuery,
 680         and so delegated to children documents.
 681         """
 682         if BooleanQuery.instance_(query):
 683             qs = BooleanQuery.cast_(query)
 684             for clause in qs:
 685                 clause = BooleanClause.cast_(clause)
 686                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
 687             return qs
 688         else:
 689             termset = HashSet()
 690             query.extractTerms(termset)
 691             for t in termset:
 692                 t = Term.cast_(t)
 693                 if t.field() not in fields:
 694                     return query
 695             return BlockJoinQuery(query, self.parent_filter,
 696                                   BlockJoinQuery.ScoreMode.Total)
 697
 698     def bsearch(self, query, max_results=50):
 699         q = self.query(query)
 700         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
 701
 702         tops = self.searcher.search(bjq, max_results)
 703         bks = []
 704         for found in tops.scoreDocs:
 705             doc = self.searcher.doc(found.doc)
 706             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 707         return (bks, tops.totalHits)
 708
 709
 710 class SearchResult(object):
 711     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
 712         if tokens_cache is None: tokens_cache = {}
 713
 714         if score:
 715             self._score = score
 716         else:
 717             self._score = scoreDocs.score
 718
 719         self.boost = 1.0
 720
 721         self._hits = []
 722         self._processed_hits = None  # processed hits
 723
 724         stored = search.searcher.doc(scoreDocs.doc)
 725         self.book_id = int(stored.get("book_id"))
 726
 727         pd = stored.get("published_date")
 728         try:
 729             self.published_date = int(pd)
 730         except ValueError:
 731             self.published_date = 0
 732
 733         header_type = stored.get("header_type")
 734         # we have a content hit in some header of fragment
 735         if header_type is not None:
 736             sec = (header_type, int(stored.get("header_index")))
 737             header_span = stored.get('header_span')
 738             header_span = header_span is not None and int(header_span) or 1
 739
 740             fragment = stored.get("fragment_anchor")
 741
 742             if snippets:
 743                 snippets = snippets.replace("/\n", "\n")
 744             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
 745
 746             self._hits.append(hit)
 747
 748         self.search = search
 749         self.searched = searched
 750         self.tokens_cache = tokens_cache
 751
 752     @property
 753     def score(self):
 754         return self._score * self.boost
 755
 756     def merge(self, other):
 757         if self.book_id != other.book_id:
 758             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 759         self._hits += other._hits
 760         if other.score > self.score:
 761             self._score = other._score
 762         return self
 763
 764     def get_book(self):
 765         if hasattr(self, '_book'):
 766             return self._book
 767         return catalogue.models.Book.objects.get(id=self.book_id)
 768
 769     book = property(get_book)
 770
 771     @property
 772     def hits(self):
 773         if self._processed_hits is not None:
 774             return self._processed_hits
 775
 776         POSITION = 0
 777         FRAGMENT = 1
 778         POSITION_INDEX = 1
 779         POSITION_SPAN = 2
 780         SCORE = 2
 781         OTHER = 3
 782
 783         # to sections and fragments
 784         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
 785
 786         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
 787
 788         # sections not covered by fragments
 789         sect = filter(lambda s: 0 == len(filter(
 790             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
 791             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
 792             frags)), sect)
 793
 794         hits = []
 795
 796         def remove_duplicates(lst, keyfn, compare):
 797             els = {}
 798             for e in lst:
 799                 eif = keyfn(e)
 800                 if eif in els:
 801                     if compare(els[eif], e) >= 1:
 802                         continue
 803                 els[eif] = e
 804             return els.values()
 805
 806         # remove fragments with duplicated fid's and duplicated snippets
 807         frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
 808         frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
 809                                   lambda a, b: cmp(a[SCORE], b[SCORE]))
 810
 811         # remove duplicate sections
 812         sections = {}
 813
 814         for s in sect:
 815             si = s[POSITION][POSITION_INDEX]
 816             # skip existing
 817             if si in sections:
 818                 if sections[si]['score'] >= s[SCORE]:
 819                     continue
 820
 821             m = {'score': s[SCORE],
 822                  'section_number': s[POSITION][POSITION_INDEX] + 1,
 823                  }
 824             m.update(s[OTHER])
 825             sections[si] = m
 826
 827         hits = sections.values()
 828
 829         for f in frags:
 830             try:
 831                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
 832             except catalogue.models.Fragment.DoesNotExist:
 833                 # stale index
 834                 continue
 835
 836             # Figure out if we were searching for a token matching some word in theme name.
 837             themes = frag.tags.filter(category='theme')
 838             themes_hit = []
 839             if self.searched is not None:
 840                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
 841                 for theme in themes:
 842                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
 843                     for t in tokens:
 844                         if t in name_tokens:
 845                             if not theme in themes_hit:
 846                                 themes_hit.append(theme)
 847                             break
 848
 849             m = {'score': f[SCORE],
 850                  'fragment': frag,
 851                  'section_number': f[POSITION][POSITION_INDEX] + 1,
 852                  'themes': themes,
 853                  'themes_hit': themes_hit
 854                  }
 855             m.update(f[OTHER])
 856             hits.append(m)
 857
 858         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 859
 860         self._processed_hits = hits
 861
 862         return hits
 863
 864     def __unicode__(self):
 865         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
 866
 867     @staticmethod
 868     def aggregate(*result_lists):
 869         books = {}
 870         for rl in result_lists:
 871             for r in rl:
 872                 if r.book_id in books:
 873                     books[r.book_id].merge(r)
 874                 else:
 875                     books[r.book_id] = r
 876         return books.values()
 877
 878     def __cmp__(self, other):
 879         c = cmp(self.score, other.score)
 880         if c == 0:
 881             # this is inverted, because earlier date is better
 882             return cmp(other.published_date, self.published_date)
 883         else:
 884             return c
 885
 886
 887 class Hint(object):
 888     """
 889     Given some hint information (information we already know about)
 890     our search target - like author, title (specific book), epoch, genre, kind
 891     we can narrow down search using filters.
 892     """
 893     def __init__(self, search):
 894         """
 895         Accepts a Searcher instance.
 896         """
 897         self.search = search
 898         self.book_tags = {}
 899         self.part_tags = []
 900         self._books = []
 901
 902     def books(self, *books):
 903         """
 904         Give a hint that we search these books.
 905         """
 906         self._books = books
 907
 908     def tags(self, tags):
 909         """
 910         Give a hint that these Tag objects (a list of)
 911         is necessary.
 912         """
 913         for t in tags:
 914             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
 915                 lst = self.book_tags.get(t.category, [])
 916                 lst.append(t)
 917                 self.book_tags[t.category] = lst
 918             if t.category in ['theme', 'theme_pl']:
 919                 self.part_tags.append(t)
 920
 921     def tag_filter(self, tags, field='tags'):
 922         """
 923         Given a lsit of tags and an optional field (but they are normally in tags field)
 924         returns a filter accepting only books with specific tags.
 925         """
 926         q = BooleanQuery()
 927
 928         for tag in tags:
 929             toks = self.search.get_tokens(tag.name, field=field)
 930             tag_phrase = PhraseQuery()
 931             for tok in toks:
 932                 tag_phrase.add(Term(field, tok))
 933             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
 934
 935         return QueryWrapperFilter(q)
 936
 937     def book_filter(self):
 938         """
 939         Filters using book tags (all tag kinds except a theme)
 940         """
 941         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
 942         if tags:
 943             return self.tag_filter(tags)
 944         else:
 945             return None
 946
 947     def part_filter(self):
 948         """
 949         This filter can be used to look for book parts.
 950         It filters on book id and/or themes.
 951         """
 952         fs = []
 953         if self.part_tags:
 954             fs.append(self.tag_filter(self.part_tags, field='themes'))
 955
 956         if self._books != []:
 957             bf = BooleanFilter()
 958             for b in self._books:
 959                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
 960                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
 961             fs.append(bf)
 962
 963         return Search.chain_filters(fs)
 964
 965     def should_search_for_book(self):
 966         return self._books == []
 967
 968     def just_search_in(self, all):
 969         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
 970         some = []
 971         for field in all:
 972             if field == 'authors' and 'author' in self.book_tags:
 973                 continue
 974             if field == 'title' and self._books != []:
 975                 continue
 976             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
 977                 continue
 978             some.append(field)
 979         return some
 980
 981
 982 class Search(IndexStore):
 983     """
 984     Search facilities.
 985     """
 986     def __init__(self, default_field="content"):
 987         IndexStore.__init__(self)
 988         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
 989         # self.analyzer = WLAnalyzer()
 990         reader = IndexReader.open(self.store, True)
 991         self.searcher = IndexSearcher(reader)
 992         self.parser = QueryParser(Version.LUCENE_34, default_field,
 993                                   self.analyzer)
 994
 995         self.parent_filter = TermsFilter()
 996         self.parent_filter.addTerm(Term("is_book", "true"))
 997         index_changed.connect(self.reopen)
 998
 999     def close(self):
1000         reader = self.searcher.getIndexReader()
1001         self.searcher.close()
1002         reader.close()
1003         super(Search, self).close()
1004         index_changed.disconnect(self.reopen)
1005
1006     def reopen(self, **unused):
1007         reader = self.searcher.getIndexReader()
1008         rdr = reader.reopen()
1009         if not rdr.equals(reader):
1010             log.debug('Reopening index')
1011             oldsearch = self.searcher
1012             self.searcher = IndexSearcher(rdr)
1013             oldsearch.close()
1014             reader.close()
1015
1016     def query(self, query):
1017         """Parse query in default Lucene Syntax. (for humans)
1018         """
1019         return self.parser.parse(query)
1020
1021     def simple_search(self, query, max_results=50):
1022         """Runs a query for books using lucene syntax. (for humans)
1023         Returns (books, total_hits)
1024         """
1025
1026         tops = self.searcher.search(self.query(query), max_results)
1027         bks = []
1028         for found in tops.scoreDocs:
1029             doc = self.searcher.doc(found.doc)
1030             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1031         return (bks, tops.totalHits)
1032
1033     def get_tokens(self, searched, field='content', cached=None):
1034         """returns tokens analyzed by a proper (for a field) analyzer
1035         argument can be: StringReader, string/unicode, or tokens. In the last case
1036         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
1037         """
1038         if cached is not None and field in cached:
1039             return cached[field]
1040
1041         if isinstance(searched, str) or isinstance(searched, unicode):
1042             searched = StringReader(searched)
1043         elif isinstance(searched, list):
1044             return searched
1045
1046         searched.reset()
1047         tokens = self.analyzer.reusableTokenStream(field, searched)
1048         toks = []
1049         while tokens.incrementToken():
1050             cta = tokens.getAttribute(CharTermAttribute.class_)
1051             toks.append(cta.toString())
1052
1053         if cached is not None:
1054             cached[field] = toks
1055
1056         return toks
1057
1058     def fuzziness(self, fuzzy):
1059         """Helper method to sanitize fuzziness"""
1060         if not fuzzy:
1061             return None
1062         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
1063             return fuzzy
1064         else:
1065             return 0.5
1066
1067     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
1068         """
1069         Return a PhraseQuery with a series of tokens.
1070         """
1071         if fuzzy:
1072             phrase = MultiPhraseQuery()
1073             for t in tokens:
1074                 term = Term(field, t)
1075                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
1076                 fuzzterms = []
1077
1078                 while True:
1079                     ft = fuzzterm.term()
1080                     if ft:
1081                         fuzzterms.append(ft)
1082                     if not fuzzterm.next(): break
1083                 if fuzzterms:
1084                     phrase.add(JArray('object')(fuzzterms, Term))
1085                 else:
1086                     phrase.add(term)
1087         else:
1088             phrase = PhraseQuery()
1089             phrase.setSlop(slop)
1090             for t in tokens:
1091                 term = Term(field, t)
1092                 phrase.add(term)
1093         return phrase
1094
1095     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
1096         """
1097         Returns term queries joined by boolean query.
1098         modal - applies to boolean query
1099         fuzzy - should the query by fuzzy.
1100         """
1101         q = BooleanQuery()
1102         for t in tokens:
1103             term = Term(field, t)
1104             if fuzzy:
1105                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1106             else:
1107                 term = TermQuery(term)
1108             q.add(BooleanClause(term, modal))
1109         return q
1110
1111     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1112                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1113         if filters is None: filters = []
1114         if tokens_cache is None: tokens_cache = {}
1115
1116         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1117
1118         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1119         if book:
1120             filters.append(self.term_filter(Term('is_book', 'true')))
1121         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1122
1123         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1124
1125     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1126                     filters=None, tokens_cache=None, boost=None, snippets=True):
1127         if filters is None: filters = []
1128         if tokens_cache is None: tokens_cache = {}
1129
1130         if book:
1131             filters.append(self.term_filter(Term('is_book', 'true')))
1132
1133         query = BooleanQuery()
1134
1135         for fld in fields:
1136             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1137
1138             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1139                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1140
1141         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1142
1143         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1144                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1145
1146     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1147         """
1148         Search for perfect book matches. Just see if the query matches with some author or title,
1149         taking hints into account.
1150         """
1151         fields_to_search = ['authors', 'title']
1152         only_in = None
1153         if hint:
1154             if not hint.should_search_for_book():
1155                 return []
1156             fields_to_search = hint.just_search_in(fields_to_search)
1157             only_in = hint.book_filter()
1158
1159         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1160
1161         books = []
1162         for q in qrys:
1163             top = self.searcher.search(q,
1164                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1165                 max_results)
1166             for found in top.scoreDocs:
1167                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1168         return books
1169
1170     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1171         fields_to_search = ['tags', 'authors', 'title']
1172
1173         only_in = None
1174         if hint:
1175             if not hint.should_search_for_book():
1176                 return []
1177             fields_to_search = hint.just_search_in(fields_to_search)
1178             only_in = hint.book_filter()
1179
1180         tokens = self.get_tokens(searched, field='SIMPLE')
1181
1182         q = BooleanQuery()
1183
1184         for fld in fields_to_search:
1185             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1186                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1187
1188         books = []
1189         top = self.searcher.search(q,
1190                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1191             max_results)
1192         for found in top.scoreDocs:
1193             books.append(SearchResult(self, found, how_found="search_book"))
1194
1195         return books
1196
1197     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1198         """
1199         Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
1200         some part/fragment of the book.
1201         """
1202         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1203
1204         flt = None
1205         if hint:
1206             flt = hint.part_filter()
1207
1208         books = []
1209         for q in qrys:
1210             top = self.searcher.search(q,
1211                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1212                                                            flt]),
1213                                        max_results)
1214             for found in top.scoreDocs:
1215                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1216
1217         return books
1218
1219     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1220         """
1221         Tries to use search terms to match different fields of book (or its parts).
1222         E.g. one word can be an author survey, another be a part of the title, and the rest
1223         are some words from third chapter.
1224         """
1225         if tokens_cache is None: tokens_cache = {}
1226         books = []
1227         only_in = None
1228
1229         if hint:
1230             only_in = hint.part_filter()
1231
1232         # content only query : themes x content
1233         q = BooleanQuery()
1234
1235         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1236         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1237
1238         # only search in themes when we do not already filter by themes
1239         if hint is None or hint.just_search_in(['themes']) != []:
1240             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1241                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1242
1243         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1244                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1245
1246         topDocs = self.searcher.search(q, only_in, max_results)
1247         for found in topDocs.scoreDocs:
1248             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1249
1250         # query themes/content x author/title/tags
1251         q = BooleanQuery()
1252         in_content = BooleanQuery()
1253         in_meta = BooleanQuery()
1254
1255         for fld in ['themes_pl', 'content']:
1256             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1257
1258         for fld in ['tags', 'authors', 'title']:
1259             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1260
1261         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1262         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1263
1264         topDocs = self.searcher.search(q, only_in, max_results)
1265         for found in topDocs.scoreDocs:
1266             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1267
1268         return books
1269
1270     # def multisearch(self, query, max_results=50):
1271     #     """
1272     #     Search strategy:
1273     #     - (phrase) OR -> content
1274     #                   -> title
1275     #                   -> authors
1276     #     - (keywords)  -> authors
1277     #                   -> motyw
1278     #                   -> tags
1279     #                   -> content
1280     #     """
1281         # queryreader = StringReader(query)
1282         # tokens = self.get_tokens(queryreader)
1283
1284         # top_level = BooleanQuery()
1285         # Should = BooleanClause.Occur.SHOULD
1286
1287         # phrase_level = BooleanQuery()
1288         # phrase_level.setBoost(1.3)
1289
1290         # p_content = self.make_phrase(tokens, joined=True)
1291         # p_title = self.make_phrase(tokens, 'title')
1292         # p_author = self.make_phrase(tokens, 'author')
1293
1294         # phrase_level.add(BooleanClause(p_content, Should))
1295         # phrase_level.add(BooleanClause(p_title, Should))
1296         # phrase_level.add(BooleanClause(p_author, Should))
1297
1298         # kw_level = BooleanQuery()
1299
1300         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1301         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1302         # kw_level.add(j_themes, Should)
1303         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1304         # j_con = self.make_term_query(tokens, joined=True)
1305         # kw_level.add(j_con, Should)
1306
1307         # top_level.add(BooleanClause(phrase_level, Should))
1308         # top_level.add(BooleanClause(kw_level, Should))
1309
1310         # return None
1311
1312     def get_snippets(self, scoreDoc, query, field='content'):
1313         """
1314         Returns a snippet for found scoreDoc.
1315         """
1316         htmlFormatter = SimpleHTMLFormatter()
1317         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1318
1319         stored = self.searcher.doc(scoreDoc.doc)
1320
1321         position = stored.get('snippets_position')
1322         length = stored.get('snippets_length')
1323         if position is None or length is None:
1324             return None
1325         revision = stored.get('snippets_revision')
1326         if revision: revision = int(revision)
1327
1328         # locate content.
1329         book_id = int(stored.get('book_id'))
1330         snippets = Snippets(book_id, revision=revision)
1331
1332         try:
1333             snippets.open()
1334         except IOError, e:
1335             log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
1336             return []
1337
1338         try:
1339             try:
1340                 text = snippets.get((int(position),
1341                                      int(length)))
1342             finally:
1343                 snippets.close()
1344
1345             tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1346             #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1347             snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1348
1349         except Exception, e:
1350             e2 = e
1351             if hasattr(e, 'getJavaException'):
1352                 e2 = unicode(e.getJavaException())
1353             raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1354                 e2)
1355         return snip
1356
1357     @staticmethod
1358     def enum_to_array(enum):
1359         """
1360         Converts a lucene TermEnum to array of Terms, suitable for
1361         addition to queries
1362         """
1363         terms = []
1364
1365         while True:
1366             t = enum.term()
1367             if t:
1368                 terms.append(t)
1369             if not enum.next(): break
1370
1371         if terms:
1372             return JArray('object')(terms, Term)
1373
1374     def search_tags(self, query, filter=None, max_results=40, pdcounter=False):
1375         """
1376         Search for Tag objects using query.
1377         """
1378         if not pdcounter:
1379             filters = self.chain_filters([filter, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1380         tops = self.searcher.search(query, filter, max_results)
1381
1382         tags = []
1383         for found in tops.scoreDocs:
1384             doc = self.searcher.doc(found.doc)
1385             is_pdcounter = doc.get('is_pdcounter')
1386             category = doc.get('tag_category')
1387             try:
1388                 if is_pdcounter == 'true':
1389                     if category == 'pd_author':
1390                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1391                     elif category == 'pd_book':
1392                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1393                         tag.category = 'pd_book'  # make it look more lik a tag.
1394                     else:
1395                         print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1396                 else:
1397                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1398                     # don't add the pdcounter tag if same tag already exists
1399                 if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1400                     tags.append(tag)
1401             except catalogue.models.Tag.DoesNotExist: pass
1402             except PDCounterAuthor.DoesNotExist: pass
1403             except PDCounterBook.DoesNotExist: pass
1404
1405         log.debug('search_tags: %s' % tags)
1406
1407         return tags
1408
1409     def search_books(self, query, filter=None, max_results=10):
1410         """
1411         Searches for Book objects using query
1412         """
1413         bks = []
1414         tops = self.searcher.search(query, filter, max_results)
1415         for found in tops.scoreDocs:
1416             doc = self.searcher.doc(found.doc)
1417             try:
1418                 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1419             except catalogue.models.Book.DoesNotExist: pass
1420         return bks
1421
1422     def make_prefix_phrase(self, toks, field):
1423         q = MultiPhraseQuery()
1424         for i in range(len(toks)):
1425             t = Term(field, toks[i])
1426             if i == len(toks) - 1:
1427                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1428                 if pterms:
1429                     q.add(pterms)
1430                 else:
1431                     q.add(t)
1432             else:
1433                 q.add(t)
1434         return q
1435
1436     @staticmethod
1437     def term_filter(term, inverse=False):
1438         only_term = TermsFilter()
1439         only_term.addTerm(term)
1440
1441         if inverse:
1442             neg = BooleanFilter()
1443             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1444             only_term = neg
1445
1446         return only_term
1447
1448     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1449         """
1450         Return auto-complete hints for tags
1451         using prefix search.
1452         """
1453         toks = self.get_tokens(string, field='SIMPLE')
1454         top = BooleanQuery()
1455
1456         for field in ['tag_name', 'tag_name_pl']:
1457             if prefix:
1458                 q = self.make_prefix_phrase(toks, field)
1459             else:
1460                 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1461             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1462
1463         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1464
1465         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1466
1467     def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1468         """
1469         Returns auto-complete hints for book titles
1470         Because we do not index 'pseudo' title-tags.
1471         Prefix search.
1472         """
1473         toks = self.get_tokens(string, field='SIMPLE')
1474
1475         if prefix:
1476             q = self.make_prefix_phrase(toks, 'title')
1477         else:
1478             q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1479
1480         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1481
1482     @staticmethod
1483     def chain_filters(filters, op=ChainedFilter.AND):
1484         """
1485         Chains a filter list together
1486         """
1487         filters = filter(lambda x: x is not None, filters)
1488         if not filters or filters is []:
1489             return None
1490         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1491         return chf
1492
1493     def filtered_categories(self, tags):
1494         """
1495         Return a list of tag categories, present in tags list.
1496         """
1497         cats = {}
1498         for t in tags:
1499             cats[t.category] = True
1500         return cats.keys()
1501
1502     def hint(self):
1503         return Hint(self)