1 # -*- coding: utf-8 -*-
3 from django.conf import settings
4 from django.dispatch import Signal
5 from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
6 File, Field, Integer, \
7 NumericField, Version, Document, JavaError, IndexSearcher, \
8 QueryParser, PerFieldAnalyzerWrapper, \
9 SimpleAnalyzer, PolishAnalyzer, ArrayList, \
10 KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
11 BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
12 HashSet, BooleanClause, Term, CharTermAttribute, \
13 PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
14 FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
15 SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
16 BooleanFilter, FilterClause, QueryWrapperFilter, \
17 initVM, CLASSPATH, JArray
21 JVM = initVM(CLASSPATH)
27 from librarian import dcparser
28 from librarian.parser import WLDocument
29 from lxml import etree
30 import catalogue.models
31 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
32 from multiprocessing.pool import ThreadPool
33 from threading import current_thread
34 from itertools import chain
38 log = logging.getLogger('search')
40 class WLAnalyzer(PerFieldAnalyzerWrapper):
42 polish = PolishAnalyzer(Version.LUCENE_34)
43 # polish_gap.setPositionIncrementGap(999)
45 simple = SimpleAnalyzer(Version.LUCENE_34)
46 # simple_gap.setPositionIncrementGap(999)
48 keyword = KeywordAnalyzer(Version.LUCENE_34)
50 # not sure if needed: there's NOT_ANALYZED meaning basically the same
52 PerFieldAnalyzerWrapper.__init__(self, polish)
54 self.addAnalyzer("tags", simple)
55 self.addAnalyzer("technical_editors", simple)
56 self.addAnalyzer("editors", simple)
57 self.addAnalyzer("url", keyword)
58 self.addAnalyzer("source_url", keyword)
59 self.addAnalyzer("source_name", simple)
60 self.addAnalyzer("publisher", simple)
61 self.addAnalyzer("authors", simple)
62 self.addAnalyzer("title", simple)
64 self.addAnalyzer("is_book", keyword)
65 # shouldn't the title have two forms? _pl and simple?
67 self.addAnalyzer("themes", simple)
68 self.addAnalyzer("themes_pl", polish)
70 self.addAnalyzer("tag_name", simple)
71 self.addAnalyzer("tag_name_pl", polish)
73 self.addAnalyzer("translators", simple)
75 self.addAnalyzer("KEYWORD", keyword)
76 self.addAnalyzer("SIMPLE", simple)
77 self.addAnalyzer("POLISH", polish)
80 class IndexStore(object):
82 Provides access to search index.
84 self.store - lucene index directory
88 self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))
90 def make_index_dir(self):
92 os.makedirs(settings.SEARCH_INDEX)
93 except OSError as exc:
94 if exc.errno == errno.EEXIST:
102 class IndexChecker(IndexStore):
104 IndexStore.__init__(self)
107 checker = CheckIndex(self.store)
108 status = checker.checkIndex()
112 class Snippets(object):
114 This class manages snippet files for an indexed object (book).
115 The snippets are concatenated together, and their positions and
116 lengths are kept in lucene index fields.
118 SNIPPET_DIR = "snippets"
120 def __init__(self, book_id, revision=None):
122 os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
123 except OSError as exc:
124 if exc.errno == errno.EEXIST:
127 self.book_id = book_id
128 self.revision = revision
133 if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
134 else: fn = "%d" % self.book_id
136 return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
138 def open(self, mode='r'):
140 Open the snippet file. Call .close() afterwards.
146 if os.path.exists(self.path):
149 if not os.path.exists(self.path):
153 self.file = open(self.path, mode)
157 def add(self, snippet):
159 Append a snippet (unicode) to the snippet file.
160 Return a (position, length) tuple
162 txt = snippet.encode('utf-8')
165 pos = (self.position, l)
171 Given a (position, length) tuple, return the unicode
172 snippet stored there.
174 self.file.seek(pos[0], 0)
175 txt = self.file.read(pos[1]).decode('utf-8')
179 """Close snippet file"""
194 class BaseIndex(IndexStore):
197 Provides basic operations on index: opening, closing, optimizing.
199 def __init__(self, analyzer=None):
200 super(BaseIndex, self).__init__()
203 analyzer = WLAnalyzer()
204 self.analyzer = analyzer
206 def open(self, timeout=None):
208 raise Exception("Index is already opened")
209 conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
211 conf.setWriteLockTimeout(long(timeout))
212 self.index = IndexWriter(self.store, conf)
216 self.index.optimize()
220 self.index.optimize()
221 except JavaError as je:
222 log.error("Error during optimize phase, check index: %s" % je)
227 index_changed.send_robust(self)
229 super(BaseIndex, self).close()
235 def __exit__(self, type, value, tb):
239 index_changed = Signal()
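# BaseIndex subclasses are meant to be used as context managers (see __exit__
# above); a hedged sketch, assuming __enter__ opens the writer as usual:
#
#     with Index() as index:
#         index.index_tags()    # work against the open IndexWriter
#     # on exit the index is optimized and closed, and index_changed is sent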
242 class Index(BaseIndex):
244 Class indexing books.
246 def __init__(self, analyzer=None):
247 super(Index, self).__init__(analyzer)
249 def index_tags(self, *tags, **kw):
251 Re-index the global tag list.
252 Removes all tags from the index, then indexes them again.
253 Indexed fields include: id, name (with and without Polish stems), category.
255 remove_only = kw.get('remove_only', False)
256 # first, remove tags from index.
260 b_id_cat = BooleanQuery()
262 q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
263 b_id_cat.add(q_id, BooleanClause.Occur.MUST)
265 if isinstance(tag, PDCounterAuthor):
266 q_cat = TermQuery(Term('tag_category', 'pd_author'))
267 elif isinstance(tag, PDCounterBook):
268 q_cat = TermQuery(Term('tag_category', 'pd_book'))
270 q_cat = TermQuery(Term('tag_category', tag.category))
271 b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
273 q.add(b_id_cat, BooleanClause.Occur.SHOULD)
275 q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
276 self.index.deleteDocuments(q)
279 # then add them [all or just one passed]
281 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
282 PDCounterAuthor.objects.all(), \
283 PDCounterBook.objects.all())
286 if isinstance(tag, PDCounterAuthor):
288 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
289 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
290 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
291 doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
292 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
293 self.index.addDocument(doc)
294 elif isinstance(tag, PDCounterBook):
296 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
297 doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
298 doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
299 doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
300 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
301 self.index.addDocument(doc)
304 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
305 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
306 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
307 doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
308 self.index.addDocument(doc)
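# Hedged usage sketch for index_tags (some_tag below is a hypothetical Tag
# instance; assumes an open writer):
#
#     with Index() as index:
#         index.index_tags()                            # rebuild the whole tag index
#         index.index_tags(some_tag, remove_only=True)  # or just drop one tag's documents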
310 def create_book_doc(self, book):
312 Create a lucene document referring to the book id.
315 doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
316 if book.parent is not None:
317 doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
320 def remove_book(self, book_or_id, remove_snippets=True):
321 """Removes a book from search index.
322 book - Book instance."""
323 if isinstance(book_or_id, catalogue.models.Book):
324 book_id = book_or_id.id
328 q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
329 self.index.deleteDocuments(q)
332 snippets = Snippets(book_id)
335 def index_book(self, book, book_info=None, overwrite=True):
338 Creates a lucene document for extracted metadata
339 and calls self.index_content() to index the contents of the book.
342 # we don't remove snippets, since they might still be needed by
343 # threads using a not-yet-reopened index
344 self.remove_book(book, remove_snippets=False)
346 book_doc = self.create_book_doc(book)
347 meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
348 # let's not index it - it's only used for extracting publish date
349 del meta_fields['source_name']
351 for f in meta_fields.values():
352 if isinstance(f, list) or isinstance(f, tuple):
357 self.index.addDocument(book_doc)
360 self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
365 'dramat_wierszowany_l',
366 'dramat_wierszowany_lp',
367 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
371 ignore_content_tags = [
373 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
375 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
378 footnote_tags = ['pa', 'pt', 'pr', 'pe']
380 skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
382 published_date_re = re.compile("([0-9]+)[\]. ]*$")
384 def extract_metadata(self, book, book_info=None, dc_only=None):
386 Extracts metadata from the book and returns a map of fields keyed by field name
390 if book_info is None:
391 book_info = dcparser.parse(open(book.xml_file.path))
393 fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
394 fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
395 fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
398 for field in dcparser.BookInfo.FIELDS:
399 if dc_only and field.name not in dc_only:
401 if hasattr(book_info, field.name):
402 if not getattr(book_info, field.name):
404 # since no type information is available, we use validator
405 type_indicator = field.validator
406 if type_indicator == dcparser.as_unicode:
407 s = getattr(book_info, field.name)
411 fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
412 except JavaError as je:
413 raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
414 elif type_indicator == dcparser.as_person:
415 p = getattr(book_info, field.name)
416 if isinstance(p, dcparser.Person):
419 persons = ', '.join(map(unicode, p))
420 fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
421 elif type_indicator == dcparser.as_date:
422 dt = getattr(book_info, field.name)
423 fields[field.name] = Field(field.name, "%04d%02d%02d" %\
424 (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
428 if hasattr(book_info, 'source_name') and book_info.source_name:
429 match = self.published_date_re.search(book_info.source_name)
430 if match is not None:
431 pd = str(match.groups()[0])
433 fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
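# Illustrative example only: for a source_name ending in e.g. ", Warszawa 1937."
# published_date_re captures the trailing digit run, so published_date == "1937".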
437 def add_gaps(self, fields, fieldname):
439 Interposes a list of fields with gap fields (indexed spaces) and returns it.
440 This allows phrase queries which do not cross the gaps (when slop is 0).
444 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
445 return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
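# Hedged illustration of the gap trick: for two "tags" values [u"Lalka", u"Prus"],
# add_gaps yields roughly
#     Field("tags", u"Lalka"), Field("tags", u" "), Field("tags", u"Prus")
# so a slop-0 phrase query cannot match across the boundary between tag values.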
447 def get_master(self, root):
449 Returns the first master tag from an etree.
451 for master in root.iter():
452 if master.tag in self.master_tags:
455 def index_content(self, book, book_fields=[]):
457 Walks the book XML and extracts content from it.
458 Adds parts for each header tag and for each fragment.
460 wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
461 root = wld.edoc.getroot()
463 master = self.get_master(root)
467 def walker(node, ignore_tags=[]):
469 if node.tag not in ignore_tags:
470 yield node, None, None
471 if node.text is not None:
472 yield None, node.text, None
473 for child in list(node):
474 for b, t, e in walker(child):
476 yield None, None, node
478 if node.tail is not None:
479 yield None, node.tail, None
482 def fix_format(text):
483 # separator = [u" ", u"\t", u".", u";", u","]
484 if isinstance(text, list):
485 # need to join it first
486 text = filter(lambda s: s is not None, text)
487 text = u' '.join(text)
488 # for i in range(len(text)):
490 # if text[i][0] not in separator\
491 # and text[i - 1][-1] not in separator:
492 # text.insert(i, u" ")
494 return re.sub("(?m)/$", "", text)
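# Illustrative example: verse lines end with a slash, so
#     fix_format(u"Wers pierwszy/\nWers drugi/")
# returns u"Wers pierwszy\nWers drugi" (a list argument is joined first).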
496 def add_part(snippets, **fields):
497 doc = self.create_book_doc(book)
498 for f in book_fields:
501 doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
502 doc.add(NumericField("header_span", Field.Store.YES, True)\
503 .setIntValue('header_span' in fields and fields['header_span'] or 1))
504 doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
506 doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
507 Field.TermVector.WITH_POSITIONS_OFFSETS))
509 snip_pos = snippets.add(fields["content"])
510 doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
511 doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
512 if snippets.revision:
513 doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
515 if 'fragment_anchor' in fields:
516 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
517 Field.Store.YES, Field.Index.NOT_ANALYZED))
519 if 'themes' in fields:
520 themes, themes_pl = zip(*[
521 (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
522 Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
523 for theme in fields['themes']])
525 themes = self.add_gaps(themes, 'themes')
526 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
536 if isinstance(s, unicode):
537 return s.encode('utf-8')
542 snippets = Snippets(book.id).open('w')
544 for header, position in zip(list(master), range(len(master))):
546 if header.tag in self.skip_header_tags:
548 if header.tag is etree.Comment:
555 def all_content(text):
556 for frag in fragments.values():
557 frag['content'].append(text)
559 handle_text = [all_content]
562 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
564 if start is not None and start.tag in self.footnote_tags:
566 def collect_footnote(t):
568 handle_text.append(collect_footnote)
569 elif end is not None and footnote != [] and end.tag in self.footnote_tags:
571 doc = add_part(snippets, header_index=position, header_type=header.tag,
572 content=u''.join(footnote),
573 is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
575 self.index.addDocument(doc)
576 #print "@ footnote text: %s" % footnote
579 # handle fragments and themes.
580 if start is not None and start.tag == 'begin':
581 fid = start.attrib['id'][1:]
582 fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
584 # themes for this fragment
585 elif start is not None and start.tag == 'motyw':
586 fid = start.attrib['id'][1:]
587 handle_text.append(None)
588 if start.text is not None:
589 fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
590 elif end is not None and end.tag == 'motyw':
593 elif start is not None and start.tag == 'end':
594 fid = start.attrib['id'][1:]
595 if fid not in fragments:
596 continue # a broken <end> node, skip it
597 frag = fragments[fid]
598 if frag['themes'] == []:
599 continue # empty themes list.
602 doc = add_part(snippets,
603 header_type=frag['start_header'],
604 header_index=frag['start_section'],
605 header_span=position - frag['start_section'] + 1,
607 content=fix_format(frag['content']),
608 themes=frag['themes'])
609 #print '@ FRAG %s' % frag['content']
610 self.index.addDocument(doc)
614 if text is not None and handle_text != []:
615 hdl = handle_text[-1]
619 # in the end, add a section text.
620 doc = add_part(snippets, header_index=position, header_type=header.tag,
621 content=fix_format(content))
622 #print '@ CONTENT: %s' % fix_format(content)
624 self.index.addDocument(doc)
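# Hedged end-to-end sketch: indexing a single catalogue Book (assumes `book`
# is a catalogue.models.Book with an xml_file and a writable index directory):
#
#     with Index() as index:
#         index.index_book(book)   # one metadata document plus per-section documents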
630 def log_exception_wrapper(f):
635 log.error("Error in indexing thread: %s" % e)
636 traceback.print_exc()
641 class ReusableIndex(Index):
643 Works like Index, but does not close/optimize the Lucene index
644 until program exit (uses an atexit hook).
645 This is useful for the importbooks command.
647 If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
651 def open(self, analyzer=None, **kw):
652 if ReusableIndex.index:
653 self.index = ReusableIndex.index
655 Index.open(self, analyzer, **kw)
656 ReusableIndex.index = self.index
657 atexit.register(ReusableIndex.close_reusable)
659 # def index_book(self, *args, **kw):
660 # job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
661 # ReusableIndex.pool_jobs.append(job)
664 def close_reusable():
665 if ReusableIndex.index:
666 ReusableIndex.index.optimize()
667 ReusableIndex.index.close()
668 ReusableIndex.index = None
670 index_changed.send_robust(None)
673 if ReusableIndex.index:
674 ReusableIndex.index.commit()
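# Hedged usage sketch for ReusableIndex (books_to_import is hypothetical):
#
#     idx = ReusableIndex()
#     idx.open()
#     for book in books_to_import:
#         idx.index_book(book)
#     # the shared writer is optimized and closed by the atexit hook,
#     # or explicitly via ReusableIndex.close_reusable()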
677 class JoinSearch(object):
679 This mixin could be used to handle block join queries.
682 def __init__(self, *args, **kw):
683 super(JoinSearch, self).__init__(*args, **kw)
685 def wrapjoins(self, query, fields=[]):
687 This function modifies the query recursively, so that
688 contained Term and Phrase queries which match the
689 provided fields are wrapped in a BlockJoinQuery
690 and thus delegated to child documents.
692 if BooleanQuery.instance_(query):
693 qs = BooleanQuery.cast_(query)
695 clause = BooleanClause.cast_(clause)
696 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
700 query.extractTerms(termset)
703 if t.field() not in fields:
705 return BlockJoinQuery(query, self.parent_filter,
706 BlockJoinQuery.ScoreMode.Total)
708 def bsearch(self, query, max_results=50):
709 q = self.query(query)
710 bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
712 tops = self.searcher.search(bjq, max_results)
714 for found in tops.scoreDocs:
715 doc = self.searcher.doc(found.doc)
716 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
717 return (bks, tops.totalHits)
720 class SearchResult(object):
721 def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
722 if tokens_cache is None: tokens_cache = {}
727 self._score = scoreDocs.score
732 self._processed_hits = None # processed hits
734 stored = search.searcher.doc(scoreDocs.doc)
735 self.book_id = int(stored.get("book_id"))
737 pd = stored.get("published_date")
739 self.published_date = int(pd)
741 self.published_date = 0
743 header_type = stored.get("header_type")
744 # we have a content hit in some header or fragment
745 if header_type is not None:
746 sec = (header_type, int(stored.get("header_index")))
747 header_span = stored.get('header_span')
748 header_span = header_span is not None and int(header_span) or 1
750 fragment = stored.get("fragment_anchor")
753 snippets = snippets.replace("/\n", "\n")
754 hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
756 self._hits.append(hit)
759 self.searched = searched
760 self.tokens_cache = tokens_cache
764 return self._score * self.boost
766 def merge(self, other):
767 if self.book_id != other.book_id:
768 raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
769 self._hits += other._hits
770 if other.score > self.score:
771 self._score = other._score
775 if hasattr(self, '_book'):
777 return catalogue.models.Book.objects.get(id=self.book_id)
779 book = property(get_book)
783 if self._processed_hits is not None:
784 return self._processed_hits
793 # to sections and fragments
794 frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
796 sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
798 # sections not covered by fragments
799 sect = filter(lambda s: 0 == len(filter(
800 lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
801 and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
806 def remove_duplicates(lst, keyfn, compare):
811 if compare(els[eif], e) >= 1:
816 # remove fragments with duplicated fid's and duplicated snippets
817 frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
818 frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
819 lambda a, b: cmp(a[SCORE], b[SCORE]))
821 # remove duplicate sections
825 si = s[POSITION][POSITION_INDEX]
828 if sections[si]['score'] >= s[SCORE]:
831 m = {'score': s[SCORE],
832 'section_number': s[POSITION][POSITION_INDEX] + 1,
837 hits = sections.values()
841 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
842 except catalogue.models.Fragment.DoesNotExist:
846 # Figure out if we were searching for a token matching some word in theme name.
847 themes = frag.tags.filter(category='theme')
849 if self.searched is not None:
850 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
852 name_tokens = self.search.get_tokens(theme.name, 'POLISH')
855 if not theme in themes_hit:
856 themes_hit.append(theme)
859 m = {'score': f[SCORE],
861 'section_number': f[POSITION][POSITION_INDEX] + 1,
863 'themes_hit': themes_hit
868 hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
870 self._processed_hits = hits
874 def __unicode__(self):
875 return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
878 def aggregate(*result_lists):
880 for rl in result_lists:
882 if r.book_id in books:
883 books[r.book_id].merge(r)
886 return books.values()
888 def __cmp__(self, other):
889 c = cmp(self.score, other.score)
891 # this is inverted, because earlier date is better
892 return cmp(other.published_date, self.published_date)
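# Hedged example of combining partial results from several queries
# (phrase_hits and everywhere_hits are hypothetical lists of SearchResult):
#
#     results = SearchResult.aggregate(phrase_hits, everywhere_hits)
#     results.sort(reverse=True)   # __cmp__ sorts by score, then by earlier publication date
#     for r in results:
#         print r.book, r.score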
899 Given some hint information (things we already know about our search
900 target - like author, title (a specific book), epoch, genre, kind),
901 we can narrow down the search using filters.
903 def __init__(self, search):
905 Accepts a Searcher instance.
912 def books(self, *books):
914 Give a hint that we search these books.
918 def tags(self, tags):
920 Give a hint that these Tag objects (a list of)
924 if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
925 lst = self.book_tags.get(t.category, [])
927 self.book_tags[t.category] = lst
928 if t.category in ['theme', 'theme_pl']:
929 self.part_tags.append(t)
931 def tag_filter(self, tags, field='tags'):
933 Given a list of tags and an optional field (they are normally in the 'tags' field),
934 returns a filter accepting only books with specific tags.
939 toks = self.search.get_tokens(tag.name, field=field)
940 tag_phrase = PhraseQuery()
942 tag_phrase.add(Term(field, tok))
943 q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
945 return QueryWrapperFilter(q)
947 def book_filter(self):
949 Filters using book tags (all tag kinds except themes)
951 tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
953 return self.tag_filter(tags)
957 def part_filter(self):
959 This filter can be used to look for book parts.
960 It filters on book id and/or themes.
964 fs.append(self.tag_filter(self.part_tags, field='themes'))
966 if self._books != []:
968 for b in self._books:
969 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
970 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
973 return Search.chain_filters(fs)
975 def should_search_for_book(self):
976 return self._books == []
978 def just_search_in(self, all):
979 """Holds logic to figure out which indexes should be search, when we have some hinst already"""
982 if field == 'authors' and 'author' in self.book_tags:
984 if field == 'title' and self._books != []:
986 if (field == 'themes' or field == 'themes_pl') and self.part_tags:
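# Hedged usage sketch for Hint (tag_list and book are hypothetical):
#
#     search = Search()
#     hint = Hint(search)
#     hint.tags(tag_list)            # author/title/epoch/genre/kind narrow the book filter
#     hint.books(book)               # restrict to a specific book
#     flt = hint.book_filter()       # filter for book-level searches
#     part_flt = hint.part_filter()  # filter for part/fragment searches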
992 class Search(IndexStore):
996 def __init__(self, default_field="content"):
997 IndexStore.__init__(self)
998 self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34)
999 # self.analyzer = WLAnalyzer()
1000 reader = IndexReader.open(self.store, True)
1001 self.searcher = IndexSearcher(reader)
1002 self.parser = QueryParser(Version.LUCENE_34, default_field,
1005 self.parent_filter = TermsFilter()
1006 self.parent_filter.addTerm(Term("is_book", "true"))
1007 index_changed.connect(self.reopen)
1010 reader = self.searcher.getIndexReader()
1011 self.searcher.close()
1013 super(Search, self).close()
1014 index_changed.disconnect(self.reopen)
1016 def reopen(self, **unused):
1017 reader = self.searcher.getIndexReader()
1018 rdr = reader.reopen()
1019 if not rdr.equals(reader):
1020 log.debug('Reopening index')
1021 oldsearch = self.searcher
1022 self.searcher = IndexSearcher(rdr)
1026 def query(self, query):
1027 """Parse query in default Lucene Syntax. (for humans)
1029 return self.parser.parse(query)
1031 def simple_search(self, query, max_results=50):
1032 """Runs a query for books using lucene syntax. (for humans)
1033 Returns (books, total_hits)
1036 tops = self.searcher.search(self.query(query), max_results)
1038 for found in tops.scoreDocs:
1039 doc = self.searcher.doc(found.doc)
1040 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1041 return (bks, tops.totalHits)
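# Hedged example (for humans, as the docstring says):
#
#     search = Search()
#     books, total = search.simple_search(u"pan tadeusz")   # -> (list of Book, totalHits)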
1043 def get_tokens(self, searched, field='content', cached=None):
1044 """returns tokens analyzed by a proper (for a field) analyzer
1045 argument can be: StringReader, string/unicode, or tokens. In the last case
1046 they will just be returned (so we can reuse tokens, if we don't change the analyzer)
1048 if cached is not None and field in cached:
1049 return cached[field]
1051 if isinstance(searched, str) or isinstance(searched, unicode):
1052 searched = StringReader(searched)
1053 elif isinstance(searched, list):
1057 tokens = self.analyzer.reusableTokenStream(field, searched)
1059 while tokens.incrementToken():
1060 cta = tokens.getAttribute(CharTermAttribute.class_)
1061 toks.append(cta.toString())
1063 if cached is not None:
1064 cached[field] = toks
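# Illustrative only (actual tokens depend on the analyzer bound to the field):
#
#     search = Search()
#     search.get_tokens(u"Pan Tadeusz", field='SIMPLE')   # -> [u'pan', u'tadeusz']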
1069 def fuzziness(self, fuzzy):
1070 """Helper method to sanitize fuzziness"""
1073 if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
1078 def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
1080 Return a PhraseQuery with a series of tokens.
1083 phrase = MultiPhraseQuery()
1085 term = Term(field, t)
1086 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
1090 ft = fuzzterm.term()
1092 fuzzterms.append(ft)
1093 if not fuzzterm.next(): break
1095 phrase.add(JArray('object')(fuzzterms, Term))
1099 phrase = PhraseQuery()
1100 phrase.setSlop(slop)
1102 term = Term(field, t)
1107 def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
1109 Returns term queries joined by a boolean query.
1110 modal - applies to the boolean query
1111 fuzzy - should the query be fuzzy.
1115 term = Term(field, t)
1117 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1119 term = TermQuery(term)
1120 q.add(BooleanClause(term, modal))
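# Hedged sketch of building queries from analyzed tokens:
#
#     search = Search()
#     toks = search.get_tokens(u"lalka", field='title')
#     phrase_q = search.make_phrase(toks, field='title', slop=2)
#     term_q = search.make_term_query(toks, field='title', modal=BooleanClause.Occur.MUST)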
1123 def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1124 filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1125 if filters is None: filters = []
1126 if tokens_cache is None: tokens_cache = {}
1128 tokens = self.get_tokens(searched, field, cached=tokens_cache)
1130 query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1132 filters.append(self.term_filter(Term('is_book', 'true')))
1133 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1135 return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1137 def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1138 filters=None, tokens_cache=None, boost=None, snippets=True):
1139 if filters is None: filters = []
1140 if tokens_cache is None: tokens_cache = {}
1143 filters.append(self.term_filter(Term('is_book', 'true')))
1145 query = BooleanQuery()
1148 tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1150 query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1151 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1153 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1155 return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1156 snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1158 def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1160 Search for perfect book matches. Just see if the query matches some author or title,
1161 taking hints into account.
1163 fields_to_search = ['authors', 'title']
1166 if not hint.should_search_for_book():
1168 fields_to_search = hint.just_search_in(fields_to_search)
1169 only_in = hint.book_filter()
1171 qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1175 top = self.searcher.search(q,
1176 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1178 for found in top.scoreDocs:
1179 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1182 def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1183 fields_to_search = ['tags', 'authors', 'title']
1187 if not hint.should_search_for_book():
1189 fields_to_search = hint.just_search_in(fields_to_search)
1190 only_in = hint.book_filter()
1192 tokens = self.get_tokens(searched, field='SIMPLE')
1196 for fld in fields_to_search:
1197 q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1198 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1201 top = self.searcher.search(q,
1202 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1204 for found in top.scoreDocs:
1205 books.append(SearchResult(self, found, how_found="search_book"))
1209 def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1211 Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1212 some part/fragment of the book.
1214 qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1218 flt = hint.part_filter()
1222 top = self.searcher.search(q,
1223 self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1226 for found in top.scoreDocs:
1227 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1231 def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1233 Tries to use search terms to match different fields of a book (or its parts).
1234 E.g. one word can be an author's surname, another a part of the title, and the rest
1235 some words from the third chapter.
1237 if tokens_cache is None: tokens_cache = {}
1242 only_in = hint.part_filter()
1244 # content only query : themes x content
1247 tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1248 tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1250 # only search in themes when we do not already filter by themes
1251 if hint is None or hint.just_search_in(['themes']) != []:
1252 q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1253 fuzzy=fuzzy), BooleanClause.Occur.MUST))
1255 q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1256 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1258 topDocs = self.searcher.search(q, only_in, max_results)
1259 for found in topDocs.scoreDocs:
1260 books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1262 # query themes/content x author/title/tags
1264 in_content = BooleanQuery()
1265 in_meta = BooleanQuery()
1267 for fld in ['themes_pl', 'content']:
1268 in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1270 for fld in ['tags', 'authors', 'title']:
1271 in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1273 q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1274 q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1276 topDocs = self.searcher.search(q, only_in, max_results)
1277 for found in topDocs.scoreDocs:
1278 books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1282 # def multisearch(self, query, max_results=50):
1285 # - (phrase) OR -> content
1288 # - (keywords) -> authors
1293 # queryreader = StringReader(query)
1294 # tokens = self.get_tokens(queryreader)
1296 # top_level = BooleanQuery()
1297 # Should = BooleanClause.Occur.SHOULD
1299 # phrase_level = BooleanQuery()
1300 # phrase_level.setBoost(1.3)
1302 # p_content = self.make_phrase(tokens, joined=True)
1303 # p_title = self.make_phrase(tokens, 'title')
1304 # p_author = self.make_phrase(tokens, 'author')
1306 # phrase_level.add(BooleanClause(p_content, Should))
1307 # phrase_level.add(BooleanClause(p_title, Should))
1308 # phrase_level.add(BooleanClause(p_author, Should))
1310 # kw_level = BooleanQuery()
1312 # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1313 # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1314 # kw_level.add(j_themes, Should)
1315 # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1316 # j_con = self.make_term_query(tokens, joined=True)
1317 # kw_level.add(j_con, Should)
1319 # top_level.add(BooleanClause(phrase_level, Should))
1320 # top_level.add(BooleanClause(kw_level, Should))
1324 def get_snippets(self, scoreDoc, query, field='content'):
1326 Returns a snippet for the found scoreDoc.
1328 htmlFormatter = SimpleHTMLFormatter()
1329 highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1331 stored = self.searcher.doc(scoreDoc.doc)
1333 position = stored.get('snippets_position')
1334 length = stored.get('snippets_length')
1335 if position is None or length is None:
1337 revision = stored.get('snippets_revision')
1338 if revision: revision = int(revision)
1341 book_id = int(stored.get('book_id'))
1342 snippets = Snippets(book_id, revision=revision)
1347 log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
1352 text = snippets.get((int(position),
1357 tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1358 # highlighter.getBestTextFragments(tokenStream, text, False, 10)
1359 snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1361 except Exception as e:
1363 if hasattr(e, 'getJavaException'):
1364 e2 = unicode(e.getJavaException())
1365 raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1370 def enum_to_array(enum):
1372 Converts a lucene TermEnum to array of Terms, suitable for
1381 if not enum.next(): break
1384 return JArray('object')(terms, Term)
1386 def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
1388 Search for Tag objects using query.
1391 filt = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1392 tops = self.searcher.search(query, filt, max_results)
1395 for found in tops.scoreDocs:
1396 doc = self.searcher.doc(found.doc)
1397 is_pdcounter = doc.get('is_pdcounter')
1398 category = doc.get('tag_category')
1400 if is_pdcounter == 'true':
1401 if category == 'pd_author':
1402 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1403 elif category == 'pd_book':
1404 tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1405 tag.category = 'pd_book' # make it look more like a tag.
1407 print "Warning: cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1409 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1410 # don't add the pdcounter tag if same tag already exists
1411 if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1413 except catalogue.models.Tag.DoesNotExist: pass
1414 except PDCounterAuthor.DoesNotExist: pass
1415 except PDCounterBook.DoesNotExist: pass
1417 log.debug('search_tags: %s' % tags)
1421 def search_books(self, query, filt=None, max_results=10):
1423 Searches for Book objects using query
1426 tops = self.searcher.search(query, filt, max_results)
1427 for found in tops.scoreDocs:
1428 doc = self.searcher.doc(found.doc)
1430 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1431 except catalogue.models.Book.DoesNotExist: pass
1434 def make_prefix_phrase(self, toks, field):
1435 q = MultiPhraseQuery()
1436 for i in range(len(toks)):
1437 t = Term(field, toks[i])
1438 if i == len(toks) - 1:
1439 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1449 def term_filter(term, inverse=False):
1450 only_term = TermsFilter()
1451 only_term.addTerm(term)
1454 neg = BooleanFilter()
1455 neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1460 def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1462 Return auto-complete hints for tags
1463 using prefix search.
1465 toks = self.get_tokens(string, field='SIMPLE')
1466 top = BooleanQuery()
1468 for field in ['tag_name', 'tag_name_pl']:
1470 q = self.make_prefix_phrase(toks, field)
1472 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1473 top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1475 no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1477 return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
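# Hedged example of tag auto-completion (given a Search instance `search`):
#
#     search.hint_tags(u"mick", max_results=10)   # prefix search over the tag_name fields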
1479 def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1481 Returns auto-complete hints for book titles,
1482 because we do not index 'pseudo' title-tags.
1485 toks = self.get_tokens(string, field='SIMPLE')
1488 q = self.make_prefix_phrase(toks, 'title')
1490 q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1492 return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1495 def chain_filters(filters, op=ChainedFilter.AND):
1497 Chains a filter list together
1499 filters = filter(lambda x: x is not None, filters)
1500 if not filters:
1502 chf = ChainedFilter(JArray('object')(filters, Filter), op)
1505 def filtered_categories(self, tags):
1507 Return a list of tag categories present in the tags list.
1511 cats[t.category] = True