1 # -*- coding: utf-8 -*-
3 from django.conf import settings
4 from django.dispatch import Signal
from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray
21 JVM = initVM(CLASSPATH)
27 from librarian import dcparser
28 from librarian.parser import WLDocument
29 from lxml import etree
30 import catalogue.models
31 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
32 from multiprocessing.pool import ThreadPool
33 from threading import current_thread
34 from itertools import chain
38 log = logging.getLogger('search')
40 class WLAnalyzer(PerFieldAnalyzerWrapper):
42 polish = PolishAnalyzer(Version.LUCENE_34)
43 # polish_gap.setPositionIncrementGap(999)
45 simple = SimpleAnalyzer(Version.LUCENE_34)
46 # simple_gap.setPositionIncrementGap(999)
48 keyword = KeywordAnalyzer(Version.LUCENE_34)
50 # not sure if needed: there's NOT_ANALYZED meaning basically the same
52 PerFieldAnalyzerWrapper.__init__(self, polish)
54 self.addAnalyzer("tags", simple)
55 self.addAnalyzer("technical_editors", simple)
56 self.addAnalyzer("editors", simple)
57 self.addAnalyzer("url", keyword)
58 self.addAnalyzer("source_url", keyword)
59 self.addAnalyzer("source_name", simple)
60 self.addAnalyzer("publisher", simple)
61 self.addAnalyzer("authors", simple)
62 self.addAnalyzer("title", simple)
64 self.addAnalyzer("is_book", keyword)
65 # shouldn't the title have two forms? _pl and simple?
67 self.addAnalyzer("themes", simple)
68 self.addAnalyzer("themes_pl", polish)
70 self.addAnalyzer("tag_name", simple)
71 self.addAnalyzer("tag_name_pl", polish)
73 self.addAnalyzer("translators", simple)
75 self.addAnalyzer("KEYWORD", keyword)
76 self.addAnalyzer("SIMPLE", simple)
77 self.addAnalyzer("POLISH", polish)
80 class IndexStore(object):
Provides access to the search index.
self.store - the Lucene index directory
88 self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))
90 def make_index_dir(self):
92 os.makedirs(settings.SEARCH_INDEX)
93 except OSError as exc:
94 if exc.errno == errno.EEXIST:
102 class IndexChecker(IndexStore):
104 IndexStore.__init__(self)
107 checker = CheckIndex(self.store)
108 status = checker.checkIndex()
112 class Snippets(object):
This class manages snippet files for an indexed object (book).
The snippets are concatenated together, and their positions and
lengths are kept in Lucene index fields.
118 SNIPPET_DIR = "snippets"
120 def __init__(self, book_id, revision=None):
122 os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
123 except OSError as exc:
124 if exc.errno == errno.EEXIST:
127 self.book_id = book_id
128 self.revision = revision
133 if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
134 else: fn = "%d" % self.book_id
136 return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
138 def open(self, mode='r'):
140 Open the snippet file. Call .close() afterwards.
146 if os.path.exists(self.path):
149 if not os.path.exists(self.path):
153 self.file = open(self.path, mode)
157 def add(self, snippet):
159 Append a snippet (unicode) to the snippet file.
160 Return a (position, length) tuple
162 txt = snippet.encode('utf-8')
165 pos = (self.position, l)
Given a (position, length) tuple, return the snippet stored there
as a unicode string.
174 self.file.seek(pos[0], 0)
175 txt = self.file.read(pos[1]).decode('utf-8')
179 """Close snippet file"""
194 class BaseIndex(IndexStore):
Provides basic operations on the index: opening, closing, optimizing.
199 def __init__(self, analyzer=None):
200 super(BaseIndex, self).__init__()
203 analyzer = WLAnalyzer()
204 self.analyzer = analyzer
206 def open(self, timeout=None):
208 raise Exception("Index is already opened")
209 conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
211 conf.setWriteLockTimeout(long(timeout))
212 self.index = IndexWriter(self.store, conf)
216 self.index.optimize()
220 self.index.optimize()
except JavaError as je:
222 log.error("Error during optimize phase, check index: %s" % je)
227 index_changed.send_robust(self)
229 super(BaseIndex, self).close()
235 def __exit__(self, type, value, tb):
239 index_changed = Signal()
242 class Index(BaseIndex):
244 Class indexing books.
246 def __init__(self, analyzer=None):
247 super(Index, self).__init__(analyzer)
249 def index_tags(self, *tags, **kw):
Re-index the global tag list.
Removes all tags from the index, then indexes them again.
Indexed fields include: id, name (with and without Polish stems), category.
255 remove_only = kw.get('remove_only', False)
256 # first, remove tags from index.
260 b_id_cat = BooleanQuery()
262 q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
263 b_id_cat.add(q_id, BooleanClause.Occur.MUST)
265 if isinstance(tag, PDCounterAuthor):
266 q_cat = TermQuery(Term('tag_category', 'pd_author'))
267 elif isinstance(tag, PDCounterBook):
268 q_cat = TermQuery(Term('tag_category', 'pd_book'))
270 q_cat = TermQuery(Term('tag_category', tag.category))
271 b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
273 q.add(b_id_cat, BooleanClause.Occur.SHOULD)
275 q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
276 self.index.deleteDocuments(q)
279 # then add them [all or just one passed]
281 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
282 PDCounterAuthor.objects.all(), \
283 PDCounterBook.objects.all())
286 if isinstance(tag, PDCounterAuthor):
288 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
289 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
290 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
291 doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
292 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
293 self.index.addDocument(doc)
294 elif isinstance(tag, PDCounterBook):
296 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
297 doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
298 doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
299 doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
300 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
301 self.index.addDocument(doc)
304 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
305 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
306 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
307 doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
308 self.index.addDocument(doc)
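# Hedged usage sketch: tags are normally re-indexed in bulk after catalogue
# changes. BaseIndex defines __exit__, so a context-manager form is assumed to
# work here (the index is closed on exit); `some_tag` is a hypothetical Tag
# instance.
#
#   with Index() as index:
#       index.index_tags()                           # rebuild the whole tag list
#   with Index() as index:
#       index.index_tags(some_tag, remove_only=True) # just drop one tag from the index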
310 def create_book_doc(self, book):
Create a Lucene document referring to the book id.
315 doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
316 if book.parent is not None:
317 doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
320 def remove_book(self, book, remove_snippets=True):
321 """Removes a book from search index.
322 book - Book instance."""
323 q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
324 self.index.deleteDocuments(q)
327 snippets = Snippets(book.id)
330 def index_book(self, book, book_info=None, overwrite=True):
333 Creates a lucene document for extracted metadata
334 and calls self.index_content() to index the contents of the book.
# we don't remove snippets, since they might still be needed by
# threads using a not-yet-reopened index
339 self.remove_book(book, remove_snippets=False)
341 book_doc = self.create_book_doc(book)
342 meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
343 # let's not index it - it's only used for extracting publish date
344 del meta_fields['source_name']
346 for f in meta_fields.values():
347 if isinstance(f, list) or isinstance(f, tuple):
352 self.index.addDocument(book_doc)
355 self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
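# Hedged usage sketch (`book` is a hypothetical catalogue.models.Book instance):
#
#   with Index() as index:
#       index.index_book(book)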
360 'dramat_wierszowany_l',
361 'dramat_wierszowany_lp',
362 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
366 ignore_content_tags = [
368 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
370 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
373 footnote_tags = ['pa', 'pt', 'pr', 'pe']
375 skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
377 published_date_re = re.compile("([0-9]+)[\]. ]*$")
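# Illustrative note: the regexp captures the trailing run of digits in source_name
# (optionally followed by ']', '.' or spaces) as the publication year, e.g. for a
# source_name ending in u"Lwów 1894." the captured group is "1894".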
379 def extract_metadata(self, book, book_info=None, dc_only=None):
Extracts metadata from the book and returns a map of fields keyed by field name.
385 if book_info is None:
386 book_info = dcparser.parse(open(book.xml_file.path))
388 fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
389 fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
390 fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
393 for field in dcparser.BookInfo.FIELDS:
394 if dc_only and field.name not in dc_only:
396 if hasattr(book_info, field.name):
397 if not getattr(book_info, field.name):
399 # since no type information is available, we use validator
400 type_indicator = field.validator
401 if type_indicator == dcparser.as_unicode:
402 s = getattr(book_info, field.name)
406 fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
407 except JavaError as je:
408 raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
409 elif type_indicator == dcparser.as_person:
410 p = getattr(book_info, field.name)
411 if isinstance(p, dcparser.Person):
414 persons = ', '.join(map(unicode, p))
415 fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
416 elif type_indicator == dcparser.as_date:
417 dt = getattr(book_info, field.name)
418 fields[field.name] = Field(field.name, "%04d%02d%02d" %\
419 (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
423 if hasattr(book_info, 'source_name') and book_info.source_name:
424 match = self.published_date_re.search(book_info.source_name)
425 if match is not None:
426 pd = str(match.groups()[0])
428 fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
432 def add_gaps(self, fields, fieldname):
Interleaves a list of fields with gap fields (indexed spaces) and returns the result.
This allows phrase queries that do not cross the gaps (when slop is 0).
439 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
440 return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
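# Illustrative note: for fields [f1, f2, f3] this returns [f1, gap, f2, gap, f3],
# where each gap is a single indexed space in the same field, so a slop-0 phrase
# query cannot match across the boundaries between the original values.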
442 def get_master(self, root):
444 Returns the first master tag from an etree.
446 for master in root.iter():
447 if master.tag in self.master_tags:
450 def index_content(self, book, book_fields=[]):
Walks the book XML and extracts content from it.
Adds parts for each header tag and for each fragment.
455 wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
456 root = wld.edoc.getroot()
458 master = self.get_master(root)
462 def walker(node, ignore_tags=[]):
464 if node.tag not in ignore_tags:
465 yield node, None, None
466 if node.text is not None:
467 yield None, node.text, None
468 for child in list(node):
469 for b, t, e in walker(child):
471 yield None, None, node
473 if node.tail is not None:
474 yield None, node.tail, None
477 def fix_format(text):
478 # separator = [u" ", u"\t", u".", u";", u","]
479 if isinstance(text, list):
480 # need to join it first
text = filter(lambda s: s is not None, text)
482 text = u' '.join(text)
483 # for i in range(len(text)):
485 # if text[i][0] not in separator\
486 # and text[i - 1][-1] not in separator:
487 # text.insert(i, u" ")
489 return re.sub("(?m)/$", "", text)
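# Illustrative note (assuming the list is joined as above): fix_format strips the
# trailing verse marker, e.g. fix_format([u"Czego chcesz od nas, Panie/", None])
# gives u"Czego chcesz od nas, Panie".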
491 def add_part(snippets, **fields):
492 doc = self.create_book_doc(book)
493 for f in book_fields:
496 doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
497 doc.add(NumericField("header_span", Field.Store.YES, True)\
498 .setIntValue('header_span' in fields and fields['header_span'] or 1))
499 doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
501 doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
502 Field.TermVector.WITH_POSITIONS_OFFSETS))
504 snip_pos = snippets.add(fields["content"])
505 doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
506 doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
507 if snippets.revision:
508 doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
510 if 'fragment_anchor' in fields:
511 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
512 Field.Store.YES, Field.Index.NOT_ANALYZED))
514 if 'themes' in fields:
515 themes, themes_pl = zip(*[
516 (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
517 Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
518 for theme in fields['themes']])
520 themes = self.add_gaps(themes, 'themes')
521 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
531 if isinstance(s, unicode):
532 return s.encode('utf-8')
537 snippets = Snippets(book.id).open('w')
539 for header, position in zip(list(master), range(len(master))):
541 if header.tag in self.skip_header_tags:
543 if header.tag is etree.Comment:
550 def all_content(text):
551 for frag in fragments.values():
552 frag['content'].append(text)
554 handle_text = [all_content]
557 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
559 if start is not None and start.tag in self.footnote_tags:
561 def collect_footnote(t):
563 handle_text.append(collect_footnote)
elif end is not None and footnote and end.tag in self.footnote_tags:
566 doc = add_part(snippets, header_index=position, header_type=header.tag,
567 content=u''.join(footnote),
568 is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
570 self.index.addDocument(doc)
571 #print "@ footnote text: %s" % footnote
574 # handle fragments and themes.
575 if start is not None and start.tag == 'begin':
576 fid = start.attrib['id'][1:]
577 fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
579 # themes for this fragment
580 elif start is not None and start.tag == 'motyw':
581 fid = start.attrib['id'][1:]
582 handle_text.append(None)
583 if start.text is not None:
584 fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
585 elif end is not None and end.tag == 'motyw':
588 elif start is not None and start.tag == 'end':
589 fid = start.attrib['id'][1:]
590 if fid not in fragments:
591 continue # a broken <end> node, skip it
592 frag = fragments[fid]
593 if frag['themes'] == []:
594 continue # empty themes list.
597 doc = add_part(snippets,
598 header_type=frag['start_header'],
599 header_index=frag['start_section'],
600 header_span=position - frag['start_section'] + 1,
602 content=fix_format(frag['content']),
603 themes=frag['themes'])
604 #print '@ FRAG %s' % frag['content']
605 self.index.addDocument(doc)
if text is not None and handle_text:
610 hdl = handle_text[-1]
614 # in the end, add a section text.
615 doc = add_part(snippets, header_index=position, header_type=header.tag,
616 content=fix_format(content))
617 #print '@ CONTENT: %s' % fix_format(content)
619 self.index.addDocument(doc)
625 def log_exception_wrapper(f):
630 log.error("Error in indexing thread: %s" % e)
631 traceback.print_exc()
636 class ReusableIndex(Index):
Works like Index, but does not close/optimize the Lucene index
until program exit (uses an atexit hook).
This is useful for the importbooks command.
If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
646 def open(self, analyzer=None, **kw):
647 if ReusableIndex.index:
648 self.index = ReusableIndex.index
650 Index.open(self, analyzer, **kw)
651 ReusableIndex.index = self.index
652 atexit.register(ReusableIndex.close_reusable)
654 # def index_book(self, *args, **kw):
655 # job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
656 # ReusableIndex.pool_jobs.append(job)
659 def close_reusable():
660 if ReusableIndex.index:
661 ReusableIndex.index.optimize()
662 ReusableIndex.index.close()
663 ReusableIndex.index = None
665 index_changed.send_robust(None)
668 if ReusableIndex.index:
669 ReusableIndex.index.commit()
672 class JoinSearch(object):
674 This mixin could be used to handle block join queries.
677 def __init__(self, *args, **kw):
678 super(JoinSearch, self).__init__(*args, **kw)
680 def wrapjoins(self, query, fields=[]):
This function modifies the query recursively, so that contained
Term and Phrase queries which match the provided fields are wrapped
in a BlockJoinQuery and thus delegated to child documents.
687 if BooleanQuery.instance_(query):
688 qs = BooleanQuery.cast_(query)
690 clause = BooleanClause.cast_(clause)
691 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
695 query.extractTerms(termset)
698 if t.field() not in fields:
700 return BlockJoinQuery(query, self.parent_filter,
701 BlockJoinQuery.ScoreMode.Total)
703 def bsearch(self, query, max_results=50):
704 q = self.query(query)
705 bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
707 tops = self.searcher.search(bjq, max_results)
709 for found in tops.scoreDocs:
710 doc = self.searcher.doc(found.doc)
711 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
712 return (bks, tops.totalHits)
715 class SearchResult(object):
716 def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
717 if tokens_cache is None: tokens_cache = {}
722 self._score = scoreDocs.score
727 self._processed_hits = None # processed hits
729 stored = search.searcher.doc(scoreDocs.doc)
730 self.book_id = int(stored.get("book_id"))
732 pd = stored.get("published_date")
734 self.published_date = int(pd)
736 self.published_date = 0
738 header_type = stored.get("header_type")
# we have a content hit in some header or fragment
740 if header_type is not None:
741 sec = (header_type, int(stored.get("header_index")))
742 header_span = stored.get('header_span')
743 header_span = header_span is not None and int(header_span) or 1
745 fragment = stored.get("fragment_anchor")
748 snippets = snippets.replace("/\n", "\n")
749 hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
751 self._hits.append(hit)
754 self.searched = searched
755 self.tokens_cache = tokens_cache
759 return self._score * self.boost
761 def merge(self, other):
762 if self.book_id != other.book_id:
raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
764 self._hits += other._hits
765 if other.score > self.score:
766 self._score = other._score
770 if hasattr(self, '_book'):
772 return catalogue.models.Book.objects.get(id=self.book_id)
774 book = property(get_book)
778 if self._processed_hits is not None:
779 return self._processed_hits
788 # to sections and fragments
789 frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
791 sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
793 # sections not covered by fragments
794 sect = filter(lambda s: 0 == len(filter(
795 lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
796 and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
801 def remove_duplicates(lst, keyfn, compare):
806 if compare(els[eif], e) >= 1:
811 # remove fragments with duplicated fid's and duplicated snippets
812 frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
813 frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
814 lambda a, b: cmp(a[SCORE], b[SCORE]))
816 # remove duplicate sections
820 si = s[POSITION][POSITION_INDEX]
823 if sections[si]['score'] >= s[SCORE]:
826 m = {'score': s[SCORE],
827 'section_number': s[POSITION][POSITION_INDEX] + 1,
832 hits = sections.values()
836 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
837 except catalogue.models.Fragment.DoesNotExist:
841 # Figure out if we were searching for a token matching some word in theme name.
842 themes = frag.tags.filter(category='theme')
844 if self.searched is not None:
845 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
847 name_tokens = self.search.get_tokens(theme.name, 'POLISH')
850 if not theme in themes_hit:
851 themes_hit.append(theme)
854 m = {'score': f[SCORE],
856 'section_number': f[POSITION][POSITION_INDEX] + 1,
858 'themes_hit': themes_hit
863 hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
865 self._processed_hits = hits
869 def __unicode__(self):
870 return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
873 def aggregate(*result_lists):
875 for rl in result_lists:
877 if r.book_id in books:
878 books[r.book_id].merge(r)
881 return books.values()
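# Hedged usage sketch: results obtained by different strategies for the same query
# can be merged per book and ordered via __cmp__ (score, then earlier publication
# date); assumes `search` is an open Search instance.
#
#   results = SearchResult.aggregate(
#       search.search_perfect_book(u"pan tadeusz"),
#       search.search_everywhere(u"pan tadeusz"))
#   results.sort(reverse=True)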
883 def __cmp__(self, other):
884 c = cmp(self.score, other.score)
886 # this is inverted, because earlier date is better
887 return cmp(other.published_date, self.published_date)
Given some hint information (things we already know) about
our search target - like author, title (a specific book), epoch, genre, kind -
we can narrow down the search using filters.
898 def __init__(self, search):
Accepts a Search instance.
907 def books(self, *books):
Give a hint that we are searching within these books.
913 def tags(self, tags):
Give a hint that these Tag objects (a list) constrain the search.
919 if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
920 lst = self.book_tags.get(t.category, [])
922 self.book_tags[t.category] = lst
923 if t.category in ['theme', 'theme_pl']:
924 self.part_tags.append(t)
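# Hedged usage sketch (`author_tag` is a hypothetical Tag with category 'author'):
#
#   hint = Hint(search)
#   hint.tags([author_tag])   # 'authors' is dropped from searched fields, a tag filter is applied
#   results = search.search_perfect_book(u"lalka", hint=hint)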
926 def tag_filter(self, tags, field='tags'):
Given a list of tags and an optional field (they are normally in the 'tags' field),
returns a filter accepting only books with these specific tags.
934 toks = self.search.get_tokens(tag.name, field=field)
935 tag_phrase = PhraseQuery()
937 tag_phrase.add(Term(field, tok))
938 q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
940 return QueryWrapperFilter(q)
942 def book_filter(self):
Filters using book tags (all tag categories except theme).
946 tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
948 return self.tag_filter(tags)
952 def part_filter(self):
954 This filter can be used to look for book parts.
955 It filters on book id and/or themes.
959 fs.append(self.tag_filter(self.part_tags, field='themes'))
961 if self._books != []:
963 for b in self._books:
964 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
965 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
968 return Search.chain_filters(fs)
970 def should_search_for_book(self):
971 return self._books == []
973 def just_search_in(self, all):
974 """Holds logic to figure out which indexes should be search, when we have some hinst already"""
977 if field == 'authors' and 'author' in self.book_tags:
979 if field == 'title' and self._books != []:
981 if (field == 'themes' or field == 'themes_pl') and self.part_tags:
987 class Search(IndexStore):
991 def __init__(self, default_field="content"):
992 IndexStore.__init__(self)
993 self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34)
994 # self.analyzer = WLAnalyzer()
995 reader = IndexReader.open(self.store, True)
996 self.searcher = IndexSearcher(reader)
997 self.parser = QueryParser(Version.LUCENE_34, default_field,
1000 self.parent_filter = TermsFilter()
1001 self.parent_filter.addTerm(Term("is_book", "true"))
1002 index_changed.connect(self.reopen)
1005 reader = self.searcher.getIndexReader()
1006 self.searcher.close()
1008 super(Search, self).close()
1009 index_changed.disconnect(self.reopen)
1011 def reopen(self, **unused):
1012 reader = self.searcher.getIndexReader()
1013 rdr = reader.reopen()
1014 if not rdr.equals(reader):
1015 log.debug('Reopening index')
1016 oldsearch = self.searcher
1017 self.searcher = IndexSearcher(rdr)
1021 def query(self, query):
1022 """Parse query in default Lucene Syntax. (for humans)
1024 return self.parser.parse(query)
1026 def simple_search(self, query, max_results=50):
1027 """Runs a query for books using lucene syntax. (for humans)
1028 Returns (books, total_hits)
1031 tops = self.searcher.search(self.query(query), max_results)
1033 for found in tops.scoreDocs:
1034 doc = self.searcher.doc(found.doc)
1035 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1036 return (bks, tops.totalHits)
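# Hedged usage sketch: the query string is plain Lucene syntax, e.g.
#
#   books, total = search.simple_search(u"authors:mickiewicz AND title:tadeusz")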
1038 def get_tokens(self, searched, field='content', cached=None):
1039 """returns tokens analyzed by a proper (for a field) analyzer
1040 argument can be: StringReader, string/unicode, or tokens. In the last case
1041 they will just be returned (so we can reuse tokens, if we don't change the analyzer)
1043 if cached is not None and field in cached:
1044 return cached[field]
1046 if isinstance(searched, str) or isinstance(searched, unicode):
1047 searched = StringReader(searched)
1048 elif isinstance(searched, list):
1052 tokens = self.analyzer.reusableTokenStream(field, searched)
1054 while tokens.incrementToken():
1055 cta = tokens.getAttribute(CharTermAttribute.class_)
1056 toks.append(cta.toString())
1058 if cached is not None:
1059 cached[field] = toks
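# Illustrative note: the per-field analyzers configured in WLAnalyzer decide the
# output; indicative values (not guaranteed):
#
#   search.get_tokens(u"Pan Tadeusz", field='SIMPLE')   # -> [u'pan', u'tadeusz']
#   search.get_tokens(u"Pan Tadeusz", field='POLISH')   # -> Polish-stemmed forms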
1063 def fuzziness(self, fuzzy):
1064 """Helper method to sanitize fuzziness"""
1067 if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
1072 def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
1074 Return a PhraseQuery with a series of tokens.
1077 phrase = MultiPhraseQuery()
1079 term = Term(field, t)
1080 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
1084 ft = fuzzterm.term()
1086 fuzzterms.append(ft)
1087 if not fuzzterm.next(): break
1089 phrase.add(JArray('object')(fuzzterms, Term))
1093 phrase = PhraseQuery()
1094 phrase.setSlop(slop)
1096 term = Term(field, t)
1100 def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
Returns term queries joined by a boolean query.
modal - applies to the boolean query
fuzzy - whether the query should be fuzzy.
1108 term = Term(field, t)
1110 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1112 term = TermQuery(term)
1113 q.add(BooleanClause(term, modal))
1116 def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1117 filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1118 if filters is None: filters = []
1119 if tokens_cache is None: tokens_cache = {}
1121 tokens = self.get_tokens(searched, field, cached=tokens_cache)
1123 query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1125 filters.append(self.term_filter(Term('is_book', 'true')))
1126 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1128 return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
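# Hedged usage sketch: look for a (near-)exact phrase in book content, with
# snippets prepared for display; book=False so part-level documents are searched.
#
#   hits = search.search_phrase(u"ty jestes jak zdrowie", 'content',
#                               book=False, snippets=True)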
1130 def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1131 filters=None, tokens_cache=None, boost=None, snippets=True):
1132 if filters is None: filters = []
1133 if tokens_cache is None: tokens_cache = {}
1136 filters.append(self.term_filter(Term('is_book', 'true')))
1138 query = BooleanQuery()
1141 tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1143 query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1144 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1146 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1148 return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1149 snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1151 def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
Search for perfect book matches. Just see if the query matches some author or title,
taking hints into account.
1156 fields_to_search = ['authors', 'title']
1159 if not hint.should_search_for_book():
1161 fields_to_search = hint.just_search_in(fields_to_search)
1162 only_in = hint.book_filter()
1164 qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1168 top = self.searcher.search(q,
1169 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1171 for found in top.scoreDocs:
1172 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1175 def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1176 fields_to_search = ['tags', 'authors', 'title']
1180 if not hint.should_search_for_book():
1182 fields_to_search = hint.just_search_in(fields_to_search)
1183 only_in = hint.book_filter()
1185 tokens = self.get_tokens(searched, field='SIMPLE')
1189 for fld in fields_to_search:
1190 q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1191 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1194 top = self.searcher.search(q,
1195 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1197 for found in top.scoreDocs:
1198 books.append(SearchResult(self, found, how_found="search_book"))
1202 def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
some part/fragment of the book.
1207 qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1211 flt = hint.part_filter()
1215 top = self.searcher.search(q,
1216 self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1219 for found in top.scoreDocs:
1220 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1224 def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
Tries to use search terms to match different fields of a book (or its parts).
E.g. one word can be an author's surname, another a part of the title, and the rest
some words from the third chapter.
1230 if tokens_cache is None: tokens_cache = {}
1235 only_in = hint.part_filter()
1237 # content only query : themes x content
1240 tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1241 tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1243 # only search in themes when we do not already filter by themes
1244 if hint is None or hint.just_search_in(['themes']) != []:
1245 q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1246 fuzzy=fuzzy), BooleanClause.Occur.MUST))
1248 q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1249 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1251 topDocs = self.searcher.search(q, only_in, max_results)
1252 for found in topDocs.scoreDocs:
1253 books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1255 # query themes/content x author/title/tags
1257 in_content = BooleanQuery()
1258 in_meta = BooleanQuery()
1260 for fld in ['themes_pl', 'content']:
1261 in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1263 for fld in ['tags', 'authors', 'title']:
1264 in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1266 q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1267 q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1269 topDocs = self.searcher.search(q, only_in, max_results)
1270 for found in topDocs.scoreDocs:
1271 books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1275 # def multisearch(self, query, max_results=50):
1278 # - (phrase) OR -> content
1281 # - (keywords) -> authors
1286 # queryreader = StringReader(query)
1287 # tokens = self.get_tokens(queryreader)
1289 # top_level = BooleanQuery()
1290 # Should = BooleanClause.Occur.SHOULD
1292 # phrase_level = BooleanQuery()
1293 # phrase_level.setBoost(1.3)
1295 # p_content = self.make_phrase(tokens, joined=True)
1296 # p_title = self.make_phrase(tokens, 'title')
1297 # p_author = self.make_phrase(tokens, 'author')
1299 # phrase_level.add(BooleanClause(p_content, Should))
1300 # phrase_level.add(BooleanClause(p_title, Should))
1301 # phrase_level.add(BooleanClause(p_author, Should))
1303 # kw_level = BooleanQuery()
1305 # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1306 # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1307 # kw_level.add(j_themes, Should)
1308 # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1309 # j_con = self.make_term_query(tokens, joined=True)
1310 # kw_level.add(j_con, Should)
1312 # top_level.add(BooleanClause(phrase_level, Should))
1313 # top_level.add(BooleanClause(kw_level, Should))
1317 def get_snippets(self, scoreDoc, query, field='content'):
1319 Returns a snippet for found scoreDoc.
1321 htmlFormatter = SimpleHTMLFormatter()
1322 highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1324 stored = self.searcher.doc(scoreDoc.doc)
1326 position = stored.get('snippets_position')
1327 length = stored.get('snippets_length')
1328 if position is None or length is None:
1330 revision = stored.get('snippets_revision')
1331 if revision: revision = int(revision)
1334 book_id = int(stored.get('book_id'))
1335 snippets = Snippets(book_id, revision=revision)
1340 log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
1345 text = snippets.get((int(position),
1350 tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1351 # highlighter.getBestTextFragments(tokenStream, text, False, 10)
1352 snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
except Exception as e:
1356 if hasattr(e, 'getJavaException'):
1357 e2 = unicode(e.getJavaException())
1358 raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1363 def enum_to_array(enum):
Converts a Lucene TermEnum to an array of Terms, suitable for adding to queries.
1374 if not enum.next(): break
1377 return JArray('object')(terms, Term)
1379 def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
1381 Search for Tag objects using query.
1384 filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
tops = self.searcher.search(query, filters, max_results)
1388 for found in tops.scoreDocs:
1389 doc = self.searcher.doc(found.doc)
1390 is_pdcounter = doc.get('is_pdcounter')
1391 category = doc.get('tag_category')
1393 if is_pdcounter == 'true':
1394 if category == 'pd_author':
1395 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1396 elif category == 'pd_book':
1397 tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
tag.category = 'pd_book'  # make it look more like a tag.
1400 print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1402 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1403 # don't add the pdcounter tag if same tag already exists
1404 if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1406 except catalogue.models.Tag.DoesNotExist: pass
1407 except PDCounterAuthor.DoesNotExist: pass
1408 except PDCounterBook.DoesNotExist: pass
1410 log.debug('search_tags: %s' % tags)
1414 def search_books(self, query, filt=None, max_results=10):
1416 Searches for Book objects using query
1419 tops = self.searcher.search(query, filt, max_results)
1420 for found in tops.scoreDocs:
1421 doc = self.searcher.doc(found.doc)
1423 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1424 except catalogue.models.Book.DoesNotExist: pass
1427 def make_prefix_phrase(self, toks, field):
1428 q = MultiPhraseQuery()
1429 for i in range(len(toks)):
1430 t = Term(field, toks[i])
1431 if i == len(toks) - 1:
1432 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1442 def term_filter(term, inverse=False):
1443 only_term = TermsFilter()
1444 only_term.addTerm(term)
1447 neg = BooleanFilter()
1448 neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1453 def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1455 Return auto-complete hints for tags
1456 using prefix search.
1458 toks = self.get_tokens(string, field='SIMPLE')
1459 top = BooleanQuery()
1461 for field in ['tag_name', 'tag_name_pl']:
1463 q = self.make_prefix_phrase(toks, field)
1465 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1466 top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1468 no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1470 return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1472 def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
Returns auto-complete hints for book titles
(since we do not index 'pseudo' title tags).
1478 toks = self.get_tokens(string, field='SIMPLE')
1481 q = self.make_prefix_phrase(toks, 'title')
1483 q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1485 return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
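# Hedged usage sketch for the auto-complete helpers:
#
#   search.hint_tags(u"mick")     # Tag (and PDCounter) objects matching the prefix
#   search.hint_books(u"pan ta")  # Book objects whose titles match the prefix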
1488 def chain_filters(filters, op=ChainedFilter.AND):
1490 Chains a filter list together
1492 filters = filter(lambda x: x is not None, filters)
if not filters:
1495 chf = ChainedFilter(JArray('object')(filters, Filter), op)
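# Hedged usage sketch, mirroring how the searches above combine filters
# (`hint` is a hypothetical Hint instance):
#
#   flt = self.chain_filters([hint.book_filter(),
#                             self.term_filter(Term('is_book', 'true'))])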
1498 def filtered_categories(self, tags):
Return a list of tag categories present in the tags list.
1504 cats[t.category] = True