1 # -*- coding: utf-8 -*-
3 from django.conf import settings
4 from django.dispatch import Signal
5 from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
6 File, Field, Integer, \
7 NumericField, Version, Document, JavaError, IndexSearcher, \
8 QueryParser, PerFieldAnalyzerWrapper, \
9 SimpleAnalyzer, PolishAnalyzer, ArrayList, \
10 KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
11 BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
12 HashSet, BooleanClause, Term, CharTermAttribute, \
13 PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
14 FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
15 SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
16 BooleanFilter, FilterClause, QueryWrapperFilter, \
17 initVM, CLASSPATH, JArray
21 JVM = initVM(CLASSPATH)
27 from librarian import dcparser
28 from librarian.parser import WLDocument
29 from lxml import etree
30 import catalogue.models
31 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
32 from multiprocessing.pool import ThreadPool
33 from threading import current_thread
38 class WLAnalyzer(PerFieldAnalyzerWrapper):
40 polish = PolishAnalyzer(Version.LUCENE_34)
41 # polish_gap.setPositionIncrementGap(999)
43 simple = SimpleAnalyzer(Version.LUCENE_34)
44 # simple_gap.setPositionIncrementGap(999)
46 keyword = KeywordAnalyzer(Version.LUCENE_34)
48 # possibly redundant: Field.Index.NOT_ANALYZED achieves essentially the same effect
50 PerFieldAnalyzerWrapper.__init__(self, polish)
52 self.addAnalyzer("tags", simple)
53 self.addAnalyzer("technical_editors", simple)
54 self.addAnalyzer("editors", simple)
55 self.addAnalyzer("url", keyword)
56 self.addAnalyzer("source_url", keyword)
57 self.addAnalyzer("source_name", simple)
58 self.addAnalyzer("publisher", simple)
59 self.addAnalyzer("authors", simple)
60 self.addAnalyzer("title", simple)
62 self.addAnalyzer("is_book", keyword)
63 # TODO: should the title be indexed in two forms, _pl (stemmed) and simple?
65 self.addAnalyzer("themes", simple)
66 self.addAnalyzer("themes_pl", polish)
68 self.addAnalyzer("tag_name", simple)
69 self.addAnalyzer("tag_name_pl", polish)
71 self.addAnalyzer("translators", simple)
73 self.addAnalyzer("KEYWORD", keyword)
74 self.addAnalyzer("SIMPLE", simple)
75 self.addAnalyzer("POLISH", polish)
78 class IndexStore(object):
80 Provides access to the search index.
82 self.store - Lucene index directory
86 self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))
88 def make_index_dir(self):
90 os.makedirs(settings.SEARCH_INDEX)
91 except OSError as exc:
92 if exc.errno == errno.EEXIST:
100 class IndexChecker(IndexStore):
102 IndexStore.__init__(self)
105 checker = CheckIndex(self.store)
106 status = checker.checkIndex()
110 class Snippets(object):
112 This class manages snippet files for an indexed object (book).
113 The snippets are concatenated together, and their positions and
114 lengths are kept in Lucene index fields.
116 SNIPPET_DIR = "snippets"
118 def __init__(self, book_id, revision=None):
120 os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
121 except OSError as exc:
122 if exc.errno == errno.EEXIST:
125 self.book_id = book_id
126 self.revision = revision
131 if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
132 else: fn = "%d" % self.book_id
134 return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
136 def open(self, mode='r'):
138 Open the snippet file. Call .close() afterwards.
144 if os.path.exists(self.path):
147 if not os.path.exists(self.path):
150 print "using %s" % self.path
152 self.file = open(self.path, mode)
156 def add(self, snippet):
158 Append a snippet (unicode) to the snippet file.
159 Return a (position, length) tuple
161 txt = snippet.encode('utf-8')
164 pos = (self.position, l)
170 Given a (position, length) tuple, return a unicode string
171 of the snippet stored there.
173 self.file.seek(pos[0], 0)
174 txt = self.file.read(pos[1]).decode('utf-8')
178 """Close snippet file"""
193 class BaseIndex(IndexStore):
196 Provides basic operations on index: opening, closing, optimizing.
198 def __init__(self, analyzer=None):
199 super(BaseIndex, self).__init__()
202 analyzer = WLAnalyzer()
203 self.analyzer = analyzer
205 def open(self, timeout=None):
207 raise Exception("Index is already opened")
208 conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
210 conf.setWriteLockTimeout(long(timeout))
211 self.index = IndexWriter(self.store, conf)
215 self.index.optimize()
219 self.index.optimize()
220 except JavaError as je:
221 print "Error during optimize phase, check index: %s" % je
226 index_changed.send_robust(self)
228 super(BaseIndex, self).close()
234 def __exit__(self, type, value, tb):
238 index_changed = Signal()
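# Hedged usage sketch (an assumption, not present in the original module):
# components that keep an IndexReader open can react to index changes by
# connecting a receiver to this signal, e.g.:
#
#     def reopen_reader(sender, **kwargs):
#         pass  # reopen the cached reader here
#     index_changed.connect(reopen_reader)
#
# Search.reopen() below is connected in exactly this way.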
241 class Index(BaseIndex):
243 Class indexing books.
245 def __init__(self, analyzer=None):
246 super(Index, self).__init__(analyzer)
248 def index_tags(self, *tags, **kw):
250 Re-index the global tag list.
251 Removes all tags from the index, then indexes them again.
252 Indexed fields include: id, name (with and without Polish stems), category.
254 remove_only = kw.get('remove_only', False)
255 # first, remove tags from index.
259 b_id_cat = BooleanQuery()
261 q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
262 b_id_cat.add(q_id, BooleanClause.Occur.MUST)
264 if isinstance(tag, PDCounterAuthor):
265 q_cat = TermQuery(Term('tag_category', 'pd_author'))
266 elif isinstance(tag, PDCounterBook):
267 q_cat = TermQuery(Term('tag_category', 'pd_book'))
269 q_cat = TermQuery(Term('tag_category', tag.category))
270 b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
272 q.add(b_id_cat, BooleanClause.Occur.SHOULD)
274 q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
275 self.index.deleteDocuments(q)
278 # then add them back [all, or just the ones passed in]
280 tags = list(catalogue.models.Tag.objects.exclude(category='set')) + \
281 list(PDCounterAuthor.objects.all()) + \
282 list(PDCounterBook.objects.all())
285 if isinstance(tag, PDCounterAuthor):
287 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
288 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
289 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
290 doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
291 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
292 self.index.addDocument(doc)
293 elif isinstance(tag, PDCounterBook):
295 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
296 doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
297 doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
298 doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
299 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
300 self.index.addDocument(doc)
303 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
304 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
305 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
306 doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
307 self.index.addDocument(doc)
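    # Usage sketch (assumption): with the index writer open, a single changed
    # tag can be re-indexed, or just dropped from the index:
    #
    #     index = Index()
    #     index.open()
    #     index.index_tags(tag)                     # re-index one tag
    #     index.index_tags(tag, remove_only=True)   # or only remove it
    #     index.close()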
309 def create_book_doc(self, book):
311 Create a Lucene document referring to the book id.
314 doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
315 if book.parent is not None:
316 doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
319 def remove_book(self, book, remove_snippets=True):
320 """Removes a book from search index.
321 book - Book instance."""
322 q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
323 self.index.deleteDocuments(q)
326 snippets = Snippets(book.id)
329 def index_book(self, book, book_info=None, overwrite=True):
332 Creates a lucene document for extracted metadata
333 and calls self.index_content() to index the contents of the book.
336 # we don't remove snippets, since they might still be needed by
337 # threads using an index that hasn't been reopened yet
338 self.remove_book(book, remove_snippets=False)
340 book_doc = self.create_book_doc(book)
341 meta_fields = self.extract_metadata(book, book_info)
342 for f in meta_fields.values():
343 if isinstance(f, list) or isinstance(f, tuple):
348 self.index.addDocument(book_doc)
351 self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
356 'dramat_wierszowany_l',
357 'dramat_wierszowany_lp',
358 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
362 ignore_content_tags = [
364 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
366 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
369 footnote_tags = ['pa', 'pt', 'pr', 'pe']
371 skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
373 published_date_re = re.compile("([0-9]+)[\]. ]*$")
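    # For example (illustration, not from the original source): for a
    # source_name like u"Czytelnik, Warszawa 1990." the regex captures "1990";
    # the trailing "[\]. ]*$" tolerates closing brackets, dots and spaces
    # after the year.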
375 def extract_metadata(self, book, book_info=None):
377 Extract metadata from the book and return a map of fields keyed by field name.
381 if book_info is None:
382 book_info = dcparser.parse(open(book.xml_file.path))
384 fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
385 fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
386 fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
389 for field in dcparser.BookInfo.FIELDS:
390 if hasattr(book_info, field.name):
391 if not getattr(book_info, field.name):
393 # since no type information is available, we use the validator to infer the type
394 type_indicator = field.validator
395 if type_indicator == dcparser.as_unicode:
396 s = getattr(book_info, field.name)
400 fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
401 except JavaError as je:
402 raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
403 elif type_indicator == dcparser.as_person:
404 p = getattr(book_info, field.name)
405 if isinstance(p, dcparser.Person):
408 persons = ', '.join(map(unicode, p))
409 fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
410 elif type_indicator == dcparser.as_date:
411 dt = getattr(book_info, field.name)
412 fields[field.name] = Field(field.name, "%04d%02d%02d" %\
413 (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
417 if hasattr(book_info, 'source_name') and book_info.source_name:
418 match = self.published_date_re.search(book_info.source_name)
419 if match is not None:
420 pd = str(match.groups()[0])
422 fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
426 def add_gaps(self, fields, fieldname):
428 Interposes a list of fields with gap fields (indexed spaces) and returns the result.
429 This allows phrase queries that do not cross the gaps (when slop is 0).
433 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
434 return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
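    # Sketch of the effect (hypothetical field list): [A, B, C] becomes
    # [A, gap, B, gap, C]; zip() pairs each field with a gap and the final
    # [0:-1] slice drops the trailing gap.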
436 def get_master(self, root):
438 Returns the first master tag from an etree.
440 for master in root.iter():
441 if master.tag in self.master_tags:
444 def index_content(self, book, book_fields=[]):
446 Walks the book XML and extracts content from it.
447 Adds parts for each header tag and for each fragment.
449 wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
450 root = wld.edoc.getroot()
452 master = self.get_master(root)
456 def walker(node, ignore_tags=[]):
458 if node.tag not in ignore_tags:
459 yield node, None, None
460 if node.text is not None:
461 yield None, node.text, None
462 for child in list(node):
463 for b, t, e in walker(child):
465 yield None, None, node
467 if node.tail is not None:
468 yield None, node.tail, None
471 def fix_format(text):
472 # separator = [u" ", u"\t", u".", u";", u","]
473 if isinstance(text, list):
474 # need to join it first
475 text = filter(lambda s: s is not None, text)
476 text = u' '.join(text)
477 # for i in range(len(text)):
479 # if text[i][0] not in separator\
480 # and text[i - 1][-1] not in separator:
481 # text.insert(i, u" ")
483 return re.sub("(?m)/$", "", text)
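        # Example of the effect (illustration): fix_format(u"Litwo! Ojczyzno moja!/")
        # returns u"Litwo! Ojczyzno moja!" - the "(?m)" flag strips the trailing
        # verse marker "/" from every line of the joined text.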
485 def add_part(snippets, **fields):
486 doc = self.create_book_doc(book)
487 for f in book_fields:
490 doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
491 doc.add(NumericField("header_span", Field.Store.YES, True)\
492 .setIntValue('header_span' in fields and fields['header_span'] or 1))
493 doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
495 print ">>[%s]>%s<<<" % (fields.get('fragment_anchor', ''), fields['content'])
497 doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
498 Field.TermVector.WITH_POSITIONS_OFFSETS))
500 snip_pos = snippets.add(fields["content"])
501 doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
502 doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
503 if snippets.revision:
504 doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
506 if 'fragment_anchor' in fields:
507 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
508 Field.Store.YES, Field.Index.NOT_ANALYZED))
510 if 'themes' in fields:
511 themes, themes_pl = zip(*[
512 (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
513 Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
514 for theme in fields['themes']])
516 themes = self.add_gaps(themes, 'themes')
517 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
527 if isinstance(s, unicode):
528 return s.encode('utf-8')
533 snippets = Snippets(book.id).open('w')
535 for position, header in enumerate(master):
537 if header.tag in self.skip_header_tags:
539 if header.tag is etree.Comment:
546 def all_content(text):
547 for frag in fragments.values():
548 frag['content'].append(text)
550 handle_text = [all_content]
553 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
555 if start is not None and start.tag in self.footnote_tags:
557 def collect_footnote(t):
559 handle_text.append(collect_footnote)
560 elif end is not None and footnote and end.tag in self.footnote_tags:
562 doc = add_part(snippets, header_index=position, header_type=header.tag,
563 content=u''.join(footnote),
564 is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
566 self.index.addDocument(doc)
567 #print "@ footnote text: %s" % footnote
570 # handle fragments and themes.
571 if start is not None and start.tag == 'begin':
572 fid = start.attrib['id'][1:]
573 fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
575 # themes for this fragment
576 elif start is not None and start.tag == 'motyw':
577 fid = start.attrib['id'][1:]
578 handle_text.append(None)
579 if start.text is not None:
580 fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
581 elif end is not None and end.tag == 'motyw':
584 elif start is not None and start.tag == 'end':
585 fid = start.attrib['id'][1:]
586 if fid not in fragments:
587 continue # a broken <end> node, skip it
588 frag = fragments[fid]
589 if frag['themes'] == []:
590 continue # empty themes list.
593 doc = add_part(snippets,
594 header_type=frag['start_header'],
595 header_index=frag['start_section'],
596 header_span=position - frag['start_section'] + 1,
598 content=fix_format(frag['content']),
599 themes=frag['themes'])
600 #print '@ FRAG %s' % frag['content']
601 self.index.addDocument(doc)
605 if text is not None and handle_text:
606 hdl = handle_text[-1]
610 # in the end, add a section text.
611 doc = add_part(snippets, header_index=position, header_type=header.tag,
612 content=fix_format(content))
613 #print '@ CONTENT: %s' % fix_format(content)
615 self.index.addDocument(doc)
621 def log_exception_wrapper(f):
626 print("Error in indexing thread: %s" % e)
627 traceback.print_exc()
632 class ReusableIndex(Index):
634 Works like Index, but does not close/optimize the Lucene index
635 until program exit (uses an atexit hook).
636 This is useful for the importbooks command.
638 If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
642 def open(self, analyzer=None, **kw):
643 if ReusableIndex.index:
644 self.index = ReusableIndex.index
646 print("opening index")
647 Index.open(self, analyzer, **kw)
648 ReusableIndex.index = self.index
649 atexit.register(ReusableIndex.close_reusable)
651 # def index_book(self, *args, **kw):
652 # job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
653 # ReusableIndex.pool_jobs.append(job)
656 def close_reusable():
657 if ReusableIndex.index:
658 print("closing index")
659 ReusableIndex.index.optimize()
660 ReusableIndex.index.close()
661 ReusableIndex.index = None
663 index_changed.send_robust(None)
666 if ReusableIndex.index:
667 ReusableIndex.index.commit()
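# Hedged usage sketch (never called here), mirroring the docstring above:
# a long-running import script keeps one writer open across many books.
def _example_reusable_index_usage(books):
    idx = ReusableIndex()
    idx.open()
    for book in books:
        idx.index_book(book)
    # The atexit hook registered in open() will eventually optimize and close
    # the shared writer; call close_reusable() explicitly if atexit cannot be
    # relied on.
    ReusableIndex.close_reusable()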
670 class JoinSearch(object):
672 This mixin could be used to handle block join queries.
675 def __init__(self, *args, **kw):
676 super(JoinSearch, self).__init__(*args, **kw)
678 def wrapjoins(self, query, fields=[]):
680 This function modifies the query recursively, so that
681 contained Term and Phrase queries which match the
682 provided fields are wrapped in a BlockJoinQuery
683 and thus delegated to child documents.
685 if BooleanQuery.instance_(query):
686 qs = BooleanQuery.cast_(query)
688 clause = BooleanClause.cast_(clause)
689 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
693 query.extractTerms(termset)
696 if t.field() not in fields:
698 return BlockJoinQuery(query, self.parent_filter,
699 BlockJoinQuery.ScoreMode.Total)
701 def bsearch(self, query, max_results=50):
702 q = self.query(query)
703 bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
705 tops = self.searcher.search(bjq, max_results)
707 for found in tops.scoreDocs:
708 doc = self.searcher.doc(found.doc)
709 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
710 return (bks, tops.totalHits)
713 class SearchResult(object):
714 def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
715 if tokens_cache is None: tokens_cache = {}
720 self._score = scoreDocs.score
725 self._processed_hits = None # processed hits
727 stored = search.searcher.doc(scoreDocs.doc)
728 self.book_id = int(stored.get("book_id"))
730 pd = stored.get("published_date")
732 self.published_date = int(pd)
734 self.published_date = 0
736 header_type = stored.get("header_type")
737 # we have a content hit in some header or fragment
738 if header_type is not None:
739 sec = (header_type, int(stored.get("header_index")))
740 header_span = stored.get('header_span')
741 header_span = header_span is not None and int(header_span) or 1
743 fragment = stored.get("fragment_anchor")
746 snippets = snippets.replace("/\n", "\n")
747 hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
749 self._hits.append(hit)
752 self.searched = searched
753 self.tokens_cache = tokens_cache
757 return self._score * self.boost
759 def merge(self, other):
760 if self.book_id != other.book_id:
761 raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
762 self._hits += other._hits
763 if other.score > self.score:
764 self._score = other._score
768 if hasattr(self, '_book'):
770 return catalogue.models.Book.objects.get(id=self.book_id)
772 book = property(get_book)
776 if self._processed_hits is not None:
777 return self._processed_hits
786 # to sections and fragments
787 frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
789 sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
791 # sections not covered by fragments
792 sect = filter(lambda s: 0 == len(filter(
793 lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
794 and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
799 def remove_duplicates(lst, keyfn, compare):
804 if compare(els[eif], e) >= 1:
809 # remove fragments with duplicated fid's and duplicated snippets
810 frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
811 frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or hash(f),
812 lambda a, b: cmp(a[SCORE], b[SCORE]))
814 # remove duplicate sections
818 si = s[POSITION][POSITION_INDEX]
821 if sections[si]['score'] >= s[SCORE]:
824 m = {'score': s[SCORE],
825 'section_number': s[POSITION][POSITION_INDEX] + 1,
830 hits = sections.values()
834 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
835 except catalogue.models.Fragment.DoesNotExist:
839 # Figure out if we were searching for a token matching some word in theme name.
840 themes = frag.tags.filter(category='theme')
842 if self.searched is not None:
843 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
845 name_tokens = self.search.get_tokens(theme.name, 'POLISH')
848 if theme not in themes_hit:
849 themes_hit.append(theme)
852 m = {'score': f[SCORE],
854 'section_number': f[POSITION][POSITION_INDEX] + 1,
856 'themes_hit': themes_hit
861 hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
863 self._processed_hits = hits
867 def __unicode__(self):
868 return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
871 def aggregate(*result_lists):
873 for rl in result_lists:
875 if r.book_id in books:
876 books[r.book_id].merge(r)
877 #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
880 return books.values()
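    # Usage sketch (assumption): merge per-book hits coming from different
    # query strategies into a single SearchResult per book, e.g.:
    #
    #     results = SearchResult.aggregate(
    #         search.search_perfect_book(query),
    #         search.search_everywhere(query))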
882 def __cmp__(self, other):
883 c = cmp(self.score, other.score)
885 # this is inverted, because earlier date is better
886 return cmp(other.published_date, self.published_date)
893 Given some hint information (things we already know) about
894 our search target - like author, title (a specific book), epoch, genre, kind -
895 we can narrow down the search using filters.
897 def __init__(self, search):
899 Accepts a Searcher instance.
906 def books(self, *books):
908 Give a hint that we are searching within these books.
912 def tags(self, tags):
914 Give a hint that these Tag objects (a list of)
918 if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
919 lst = self.book_tags.get(t.category, [])
921 self.book_tags[t.category] = lst
922 if t.category in ['theme', 'theme_pl']:
923 self.part_tags.append(t)
925 def tag_filter(self, tags, field='tags'):
927 Given a list of tags and an optional field (they are normally in the 'tags' field),
928 returns a filter accepting only books with specific tags.
933 toks = self.search.get_tokens(tag.name, field=field)
934 tag_phrase = PhraseQuery()
936 tag_phrase.add(Term(field, tok))
937 q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
939 return QueryWrapperFilter(q)
941 def book_filter(self):
943 Filters using book tags (all tag categories except theme).
945 tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
947 return self.tag_filter(tags)
951 def part_filter(self):
953 This filter can be used to look for book parts.
954 It filters on book id and/or themes.
958 fs.append(self.tag_filter(self.part_tags, field='themes'))
960 if self._books != []:
962 for b in self._books:
963 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
964 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
967 return Search.chain_filters(fs)
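    # Usage sketch (assumption): narrow a part search down to books and themes
    # we already know about:
    #
    #     hint = Hint(search)
    #     hint.books(book)
    #     hint.tags(tags)
    #     results = search.search_perfect_parts(u"szabla", hint=hint)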
969 def should_search_for_book(self):
970 return self._books == []
972 def just_search_in(self, all):
973 """Holds logic to figure out which indexes should be search, when we have some hinst already"""
976 if field == 'authors' and 'author' in self.book_tags:
978 if field == 'title' and self._books != []:
980 if (field == 'themes' or field == 'themes_pl') and self.part_tags:
986 class Search(IndexStore):
990 def __init__(self, default_field="content"):
991 IndexStore.__init__(self)
992 self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34)
993 # self.analyzer = WLAnalyzer()
994 reader = IndexReader.open(self.store, True)
995 self.searcher = IndexSearcher(reader)
996 self.parser = QueryParser(Version.LUCENE_34, default_field,
999 self.parent_filter = TermsFilter()
1000 self.parent_filter.addTerm(Term("is_book", "true"))
1001 index_changed.connect(self.reopen)
1004 reader = self.searcher.getIndexReader()
1005 self.searcher.close()
1007 super(Search, self).close()
1008 index_changed.disconnect(self.reopen)
1010 def reopen(self, **unused):
1011 reader = self.searcher.getIndexReader()
1012 rdr = reader.reopen()
1013 print "got signal to reopen index"
1014 if not rdr.equals(reader):
1015 print "will reopen index"
1016 oldsearch = self.searcher
1017 self.searcher = IndexSearcher(rdr)
1021 def query(self, query):
1022 """Parse query in default Lucene Syntax. (for humans)
1024 return self.parser.parse(query)
1026 def simple_search(self, query, max_results=50):
1027 """Runs a query for books using lucene syntax. (for humans)
1028 Returns (books, total_hits)
1031 tops = self.searcher.search(self.query(query), max_results)
1033 for found in tops.scoreDocs:
1034 doc = self.searcher.doc(found.doc)
1035 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1036 return (bks, tops.totalHits)
1038 def get_tokens(self, searched, field='content', cached=None):
1039 """returns tokens analyzed by a proper (for a field) analyzer
1040 argument can be: StringReader, string/unicode, or tokens. In the last case
1041 they will just be returned (so we can reuse tokens, if we don't change the analyzer)
1043 if cached is not None and field in cached:
1044 return cached[field]
1046 if isinstance(searched, str) or isinstance(searched, unicode):
1047 searched = StringReader(searched)
1048 elif isinstance(searched, list):
1052 tokens = self.analyzer.reusableTokenStream(field, searched)
1054 while tokens.incrementToken():
1055 cta = tokens.getAttribute(CharTermAttribute.class_)
1056 toks.append(cta.toString())
1058 if cached is not None:
1059 cached[field] = toks
1063 def fuzziness(self, fuzzy):
1064 """Helper method to sanitize fuzziness"""
1067 if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
1072 def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
1074 Return a PhraseQuery with a series of tokens.
1077 phrase = MultiPhraseQuery()
1079 term = Term(field, t)
1080 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
1084 # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
1085 ft = fuzzterm.term()
1087 fuzzterms.append(ft)
1088 if not fuzzterm.next(): break
1090 phrase.add(JArray('object')(fuzzterms, Term))
1094 phrase = PhraseQuery()
1095 phrase.setSlop(slop)
1097 term = Term(field, t)
1101 def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
1103 Returns term queries joined into a boolean query.
1104 modal - applies to the boolean query
1105 fuzzy - should the query be fuzzy.
1109 term = Term(field, t)
1111 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1113 term = TermQuery(term)
1114 q.add(BooleanClause(term, modal))
1117 def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1118 filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1119 if filters is None: filters = []
1120 if tokens_cache is None: tokens_cache = {}
1122 tokens = self.get_tokens(searched, field, cached=tokens_cache)
1124 query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1126 filters.append(self.term_filter(Term('is_book', 'true')))
1127 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1129 return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
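    # Usage sketch (assumption): a phrase search against book content, with
    # snippets for display:
    #
    #     hits = search.search_phrase(u"lasy litewskie", 'content', snippets=True)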
1131 def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1132 filters=None, tokens_cache=None, boost=None, snippets=True):
1133 if filters is None: filters = []
1134 if tokens_cache is None: tokens_cache = {}
1137 filters.append(self.term_filter(Term('is_book', 'true')))
1139 query = BooleanQuery()
1142 tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1144 query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1145 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1147 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1149 return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1150 snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1152 def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1154 Search for perfect book matches. Just see if the query matches with some author or title,
1155 taking hints into account.
1157 fields_to_search = ['authors', 'title']
1160 if not hint.should_search_for_book():
1162 fields_to_search = hint.just_search_in(fields_to_search)
1163 only_in = hint.book_filter()
1165 qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1169 top = self.searcher.search(q,
1170 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1172 for found in top.scoreDocs:
1173 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1176 def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1177 fields_to_search = ['tags', 'authors', 'title']
1181 if not hint.should_search_for_book():
1183 fields_to_search = hint.just_search_in(fields_to_search)
1184 only_in = hint.book_filter()
1186 tokens = self.get_tokens(searched, field='SIMPLE')
1190 for fld in fields_to_search:
1191 q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1192 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1195 top = self.searcher.search(q,
1196 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1198 for found in top.scoreDocs:
1199 books.append(SearchResult(self, found, how_found="search_book"))
1203 def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1205 Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1206 some part/fragment of the book.
1208 qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1212 flt = hint.part_filter()
1216 top = self.searcher.search(q,
1217 self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1220 for found in top.scoreDocs:
1221 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1225 def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1227 Tries to use search terms to match different fields of book (or its parts).
1228 E.g. one word can be an author's surname, another can be part of the title, and the rest
1229 can be some words from the third chapter.
1231 if tokens_cache is None: tokens_cache = {}
1236 only_in = hint.part_filter()
1238 # content only query : themes x content
1241 tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1242 tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1244 # only search in themes when we do not already filter by themes
1245 if hint is None or hint.just_search_in(['themes']) != []:
1246 q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1247 fuzzy=fuzzy), BooleanClause.Occur.MUST))
1249 q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1250 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1252 topDocs = self.searcher.search(q, only_in, max_results)
1253 for found in topDocs.scoreDocs:
1254 books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1255 print "* %s theme x content: %s" % (searched, books[-1]._hits)
1257 # query themes/content x author/title/tags
1259 in_content = BooleanQuery()
1260 in_meta = BooleanQuery()
1262 for fld in ['themes_pl', 'content']:
1263 in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1265 for fld in ['tags', 'authors', 'title']:
1266 in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1268 q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1269 q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1271 topDocs = self.searcher.search(q, only_in, max_results)
1272 for found in topDocs.scoreDocs:
1273 books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1274 print "* %s scatter search: %s" % (searched, books[-1]._hits)
1278 # def multisearch(self, query, max_results=50):
1281 # - (phrase) OR -> content
1284 # - (keywords) -> authors
1289 # queryreader = StringReader(query)
1290 # tokens = self.get_tokens(queryreader)
1292 # top_level = BooleanQuery()
1293 # Should = BooleanClause.Occur.SHOULD
1295 # phrase_level = BooleanQuery()
1296 # phrase_level.setBoost(1.3)
1298 # p_content = self.make_phrase(tokens, joined=True)
1299 # p_title = self.make_phrase(tokens, 'title')
1300 # p_author = self.make_phrase(tokens, 'author')
1302 # phrase_level.add(BooleanClause(p_content, Should))
1303 # phrase_level.add(BooleanClause(p_title, Should))
1304 # phrase_level.add(BooleanClause(p_author, Should))
1306 # kw_level = BooleanQuery()
1308 # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1309 # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1310 # kw_level.add(j_themes, Should)
1311 # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1312 # j_con = self.make_term_query(tokens, joined=True)
1313 # kw_level.add(j_con, Should)
1315 # top_level.add(BooleanClause(phrase_level, Should))
1316 # top_level.add(BooleanClause(kw_level, Should))
1320 def get_snippets(self, scoreDoc, query, field='content'):
1322 Returns a snippet for found scoreDoc.
1324 htmlFormatter = SimpleHTMLFormatter()
1325 highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1327 stored = self.searcher.doc(scoreDoc.doc)
1329 position = stored.get('snippets_position')
1330 length = stored.get('snippets_length')
1331 if position is None or length is None:
1333 revision = stored.get('snippets_revision')
1334 if revision: revision = int(revision)
1336 book_id = int(stored.get('book_id'))
1337 snippets = Snippets(book_id, revision=revision).open()
1340 text = snippets.get((int(position),
1345 tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1346 # highlighter.getBestTextFragments(tokenStream, text, False, 10)
1347 snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1349 except Exception as e:
1351 if hasattr(e, 'getJavaException'):
1352 e2 = unicode(e.getJavaException())
1353 raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1358 def enum_to_array(enum):
1360 Converts a Lucene TermEnum to an array of Terms, suitable for
1369 if not enum.next(): break
1372 return JArray('object')(terms, Term)
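    # Sketch: used by make_prefix_phrase() below to expand the last token of a
    # phrase into every indexed term sharing that prefix, e.g. a PrefixTermEnum
    # over Term('title', u'pan') might yield terms like u'pan', u'pana',
    # u'panna' (hypothetical index contents).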
1374 def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1376 Search for Tag objects using query.
1379 filters = self.chain_filters([filter, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1380 tops = self.searcher.search(query, filters, max_results)
1383 for found in tops.scoreDocs:
1384 doc = self.searcher.doc(found.doc)
1385 is_pdcounter = doc.get('is_pdcounter')
1386 category = doc.get('tag_category')
1388 if is_pdcounter == 'true':
1389 if category == 'pd_author':
1390 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1391 elif category == 'pd_book':
1392 tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1393 tag.category = 'pd_book' # make it look more like a tag.
1395 print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1397 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1398 # don't add the pdcounter tag if same tag already exists
1399 if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1401 except catalogue.models.Tag.DoesNotExist: pass
1402 except PDCounterAuthor.DoesNotExist: pass
1403 except PDCounterBook.DoesNotExist: pass
1405 # print "%s (%d) -> %f" % (tag, tag.id, found.score)
1406 print 'returning %s' % tags
1409 def search_books(self, query, filter=None, max_results=10):
1411 Searches for Book objects using query
1414 tops = self.searcher.search(query, filter, max_results)
1415 for found in tops.scoreDocs:
1416 doc = self.searcher.doc(found.doc)
1418 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1419 except catalogue.models.Book.DoesNotExist: pass
1422 def make_prefix_phrase(self, toks, field):
1423 q = MultiPhraseQuery()
1424 for i in range(len(toks)):
1425 t = Term(field, toks[i])
1426 if i == len(toks) - 1:
1427 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1437 def term_filter(term, inverse=False):
1438 only_term = TermsFilter()
1439 only_term.addTerm(term)
1442 neg = BooleanFilter()
1443 neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1448 def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1450 Return auto-complete hints for tags
1451 using prefix search.
1453 toks = self.get_tokens(string, field='SIMPLE')
1454 top = BooleanQuery()
1456 for field in ['tag_name', 'tag_name_pl']:
1458 q = self.make_prefix_phrase(toks, field)
1460 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1461 top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1463 no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1465 return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
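    # Usage sketch (assumption): typeahead for the tag search box, e.g.
    # search.hint_tags(u"mick", max_results=10) could match the author tag
    # "Adam Mickiewicz" (hypothetical data).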
1467 def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1469 Returns auto-complete hints for book titles
1470 (needed because we do not index 'pseudo' title tags).
1473 toks = self.get_tokens(string, field='SIMPLE')
1476 q = self.make_prefix_phrase(toks, 'title')
1478 q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1480 return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1483 def chain_filters(filters, op=ChainedFilter.AND):
1485 Chains a filter list together
1487 filters = filter(lambda x: x is not None, filters)
1488 if not filters:
1490 chf = ChainedFilter(JArray('object')(filters, Filter), op)
1493 def filtered_categories(self, tags):
1495 Return a list of tag categories, present in tags list.
1499 cats[t.category] = True