apps/search/index.py

   1 # -*- coding: utf-8 -*-
   2
   3 from django.conf import settings
   4 from django.dispatch import Signal
   5 from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
   6     File, Field, Integer, \
   7     NumericField, Version, Document, JavaError, IndexSearcher, \
   8     QueryParser, PerFieldAnalyzerWrapper, \
   9     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
  10     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
  11     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
  12     HashSet, BooleanClause, Term, CharTermAttribute, \
  13     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
  14     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
  15     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
  16     BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
  17     initVM, CLASSPATH, JArray, JavaError
  18     # KeywordAnalyzer
  19
# Initialize jvm: start it through initVM with the CLASSPATH exported by the
# lucene module and keep the returned handle in JVM.
JVM = initVM(CLASSPATH)
  22
  23 import sys
  24 import os
  25 import re
  26 import errno
  27 from librarian import dcparser
  28 from librarian.parser import WLDocument
  29 from lxml import etree
  30 import catalogue.models
  31 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  32 from multiprocessing.pool import ThreadPool
  33 from threading import current_thread
  34 from itertools import chain
  35 import atexit
  36 import traceback
  37 import logging
# Module-wide logger; everything in this file logs under the 'search' name.
log = logging.getLogger('search')
  39
  40 class WLAnalyzer(PerFieldAnalyzerWrapper):
  41     def __init__(self):
  42         polish = PolishAnalyzer(Version.LUCENE_34)
  43         #        polish_gap.setPositionIncrementGap(999)
  44
  45         simple = SimpleAnalyzer(Version.LUCENE_34)
  46         #        simple_gap.setPositionIncrementGap(999)
  47
  48         keyword = KeywordAnalyzer(Version.LUCENE_34)
  49
  50         # not sure if needed: there's NOT_ANALYZED meaning basically the same
  51
  52         PerFieldAnalyzerWrapper.__init__(self, polish)
  53
  54         self.addAnalyzer("tags", simple)
  55         self.addAnalyzer("technical_editors", simple)
  56         self.addAnalyzer("editors", simple)
  57         self.addAnalyzer("url", keyword)
  58         self.addAnalyzer("source_url", keyword)
  59         self.addAnalyzer("source_name", simple)
  60         self.addAnalyzer("publisher", simple)
  61         self.addAnalyzer("authors", simple)
  62         self.addAnalyzer("title", simple)
  63
  64         self.addAnalyzer("is_book", keyword)
  65         # shouldn't the title have two forms? _pl and simple?
  66
  67         self.addAnalyzer("themes", simple)
  68         self.addAnalyzer("themes_pl", polish)
  69
  70         self.addAnalyzer("tag_name", simple)
  71         self.addAnalyzer("tag_name_pl", polish)
  72
  73         self.addAnalyzer("translators", simple)
  74
  75         self.addAnalyzer("KEYWORD", keyword)
  76         self.addAnalyzer("SIMPLE", simple)
  77         self.addAnalyzer("POLISH", polish)
  78
  79
  80 class IndexStore(object):
  81     """
  82     Provides access to search index.
  83
  84     self.store - lucene index directory
  85     """
  86     def __init__(self):
  87         self.make_index_dir()
  88         self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))
  89
  90     def make_index_dir(self):
  91         try:
  92             os.makedirs(settings.SEARCH_INDEX)
  93         except OSError as exc:
  94             if exc.errno == errno.EEXIST:
  95                 pass
  96             else: raise
  97
  98     def close(self):
  99         self.store.close()
 100
 101
 102 class IndexChecker(IndexStore):
 103     def __init__(self):
 104         IndexStore.__init__(self)
 105
 106     def check(self):
 107         checker = CheckIndex(self.store)
 108         status = checker.checkIndex()
 109         return status
 110
 111
 112 class Snippets(object):
 113     """
 114     This class manages snippet files for indexed object (book)
 115     the snippets are concatenated together, and their positions and
 116     lengths are kept in lucene index fields.
 117     """
 118     SNIPPET_DIR = "snippets"
 119
 120     def __init__(self, book_id, revision=None):
 121         try:
 122             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
 123         except OSError as exc:
 124             if exc.errno == errno.EEXIST:
 125                 pass
 126             else: raise
 127         self.book_id = book_id
 128         self.revision = revision
 129         self.file = None
 130
 131     @property
 132     def path(self):
 133         if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
 134         else: fn = "%d" % self.book_id
 135
 136         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
 137
 138     def open(self, mode='r'):
 139         """
 140         Open the snippet file. Call .close() afterwards.
 141         """
 142         if not 'b' in mode:
 143             mode += 'b'
 144
 145         if 'w' in mode:
 146             if os.path.exists(self.path):
 147                 self.revision = 1
 148                 while True:
 149                     if not os.path.exists(self.path):
 150                         break
 151                     self.revision += 1
 152
 153         self.file = open(self.path, mode)
 154         self.position = 0
 155         return self
 156
 157     def add(self, snippet):
 158         """
 159         Append a snippet (unicode) to the snippet file.
 160         Return a (position, length) tuple
 161         """
 162         txt = snippet.encode('utf-8')
 163         l = len(txt)
 164         self.file.write(txt)
 165         pos = (self.position, l)
 166         self.position += l
 167         return pos
 168
 169     def get(self, pos):
 170         """
 171         Given a tuple of (position, length) return an unicode
 172         of the snippet stored there.
 173         """
 174         self.file.seek(pos[0], 0)
 175         txt = self.file.read(pos[1]).decode('utf-8')
 176         return txt
 177
 178     def close(self):
 179         """Close snippet file"""
 180         self.file.close()
 181
 182     def remove(self):
 183         self.revision = None
 184         try:
 185             os.unlink(self.path)
 186             self.revision = 0
 187             while True:
 188                 self.revision += 1
 189                 os.unlink(self.path)
 190         except OSError:
 191             pass
 192
 193
 194 class BaseIndex(IndexStore):
 195     """
 196     Base index class.
 197     Provides basic operations on index: opening, closing, optimizing.
 198     """
 199     def __init__(self, analyzer=None):
 200         super(BaseIndex, self).__init__()
 201         self.index = None
 202         if not analyzer:
 203             analyzer = WLAnalyzer()
 204         self.analyzer = analyzer
 205
 206     def open(self, timeout=None):
 207         if self.index:
 208             raise Exception("Index is already opened")
 209         conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
 210         if timeout:
 211             conf.setWriteLockTimeout(long(timeout))
 212         self.index = IndexWriter(self.store, conf)
 213         return self.index
 214
 215     def optimize(self):
 216         self.index.optimize()
 217
 218     def close(self):
 219         try:
 220             self.index.optimize()
 221         except JavaError, je:
 222             log.error("Error during optimize phase, check index: %s" % je)
 223
 224         self.index.close()
 225         self.index = None
 226
 227         index_changed.send_robust(self)
 228
 229         super(BaseIndex, self).close()
 230
 231     def __enter__(self):
 232         self.open()
 233         return self
 234
 235     def __exit__(self, type, value, tb):
 236         self.close()
 237
 238
# Signal sent (via send_robust) after the lucene index has been closed:
# BaseIndex.close() sends it with the index object as sender,
# ReusableIndex.close_reusable() sends it with None.
index_changed = Signal()
 240
 241
 242 class Index(BaseIndex):
 243     """
 244     Class indexing books.
 245     """
    def __init__(self, analyzer=None):
        """
        analyzer - optional analyzer, handed unchanged to BaseIndex.__init__.
        """
        super(Index, self).__init__(analyzer)
 248
 249     def index_tags(self, *tags, **kw):
 250         """
 251         Re-index global tag list.
 252         Removes all tags from index, then index them again.
 253         Indexed fields include: id, name (with and without polish stems), category
 254         """
 255         remove_only = kw.get('remove_only', False)
 256         # first, remove tags from index.
 257         if tags:
 258             q = BooleanQuery()
 259             for tag in tags:
 260                 b_id_cat = BooleanQuery()
 261
 262                 q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
 263                 b_id_cat.add(q_id, BooleanClause.Occur.MUST)
 264
 265                 if isinstance(tag, PDCounterAuthor):
 266                     q_cat = TermQuery(Term('tag_category', 'pd_author'))
 267                 elif isinstance(tag, PDCounterBook):
 268                     q_cat = TermQuery(Term('tag_category', 'pd_book'))
 269                 else:
 270                     q_cat = TermQuery(Term('tag_category', tag.category))
 271                 b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
 272
 273                 q.add(b_id_cat, BooleanClause.Occur.SHOULD)
 274         else:  # all
 275             q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
 276             self.index.deleteDocuments(q)
 277
 278         if not remove_only:
 279             # then add them [all or just one passed]
 280             if not tags:
 281                 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
 282                     PDCounterAuthor.objects.all(), \
 283                     PDCounterBook.objects.all())
 284
 285             for tag in tags:
 286                 if isinstance(tag, PDCounterAuthor):
 287                     doc = Document()
 288                     doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
 289                     doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 290                     doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 291                     doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
 292                     doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
 293                     self.index.addDocument(doc)
 294                 elif isinstance(tag, PDCounterBook):
 295                     doc = Document()
 296                     doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
 297                     doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
 298                     doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
 299                     doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
 300                     doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
 301                     self.index.addDocument(doc)
 302                 else:
 303                     doc = Document()
 304                     doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
 305                     doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 306                     doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 307                     doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
 308                     self.index.addDocument(doc)
 309
 310     def create_book_doc(self, book):
 311         """
 312         Create a lucene document referring book id.
 313         """
 314         doc = Document()
 315         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
 316         if book.parent is not None:
 317             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
 318         return doc
 319
 320     def remove_book(self, book_or_id, remove_snippets=True):
 321         """Removes a book from search index.
 322         book - Book instance."""
 323         if isinstance(book_or_id, catalogue.models.Book):
 324             book_id = book_or_id.id
 325         else:
 326             book_id = book_or_id
 327
 328         q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
 329         self.index.deleteDocuments(q)
 330
 331         if remove_snippets:
 332             snippets = Snippets(book_id)
 333             snippets.remove()
 334
 335     def index_book(self, book, book_info=None, overwrite=True):
 336         """
 337         Indexes the book.
 338         Creates a lucene document for extracted metadata
 339         and calls self.index_content() to index the contents of the book.
 340         """
 341         if overwrite:
 342             # we don't remove snippets, since they might be still needed by
 343             # threads using not reopened index
 344             self.remove_book(book, remove_snippets=False)
 345
 346         book_doc = self.create_book_doc(book)
 347         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
 348         # let's not index it - it's only used for extracting publish date
 349         if 'source_name' in meta_fields:
 350             del meta_fields['source_name']
 351
 352         for f in meta_fields.values():
 353             if isinstance(f, list) or isinstance(f, tuple):
 354                 for elem in f:
 355                     book_doc.add(elem)
 356             else:
 357                 book_doc.add(f)
 358         self.index.addDocument(book_doc)
 359         del book_doc
 360
 361         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
 362
    # Element tags recognised by get_master(): the first element (in
    # root.iter() order) whose tag is listed here is the "master" element
    # whose children index_content() walks.
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    # Tags handed to the content walker in index_content() as ignore_tags.
    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    # Tags treated as footnotes by index_content(): their text is collected
    # separately and indexed as its own document.
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # Children of the master element that index_content() skips entirely.
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Captures the last run of digits of a string, allowing only ']', '.'
    # and spaces after it; extract_metadata() applies it to source_name.
    published_date_re = re.compile("([0-9]+)[\]. ]*$")
 384
 385     def extract_metadata(self, book, book_info=None, dc_only=None):
 386         """
 387         Extract metadata from book and returns a map of fields keyed by fieldname
 388         """
 389         fields = {}
 390
 391         if book_info is None:
 392             book_info = dcparser.parse(open(book.xml_file.path))
 393
 394         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
 395         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
 396         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
 397
 398         # validator, name
 399         for field in dcparser.BookInfo.FIELDS:
 400             if dc_only and field.name not in dc_only:
 401                 continue
 402             if hasattr(book_info, field.name):
 403                 if not getattr(book_info, field.name):
 404                     continue
 405                 # since no type information is available, we use validator
 406                 type_indicator = field.validator
 407                 if type_indicator == dcparser.as_unicode:
 408                     s = getattr(book_info, field.name)
 409                     if field.multiple:
 410                         s = ', '.join(s)
 411                     try:
 412                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
 413                     except JavaError as je:
 414                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
 415                 elif type_indicator == dcparser.as_person:
 416                     p = getattr(book_info, field.name)
 417                     if isinstance(p, dcparser.Person):
 418                         persons = unicode(p)
 419                     else:
 420                         persons = ', '.join(map(unicode, p))
 421                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
 422                 elif type_indicator == dcparser.as_date:
 423                     dt = getattr(book_info, field.name)
 424                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
 425                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 426
 427         # get published date
 428         pd = None
 429         if hasattr(book_info, 'source_name') and book_info.source_name:
 430             match = self.published_date_re.search(book_info.source_name)
 431             if match is not None:
 432                 pd = str(match.groups()[0])
 433         if not pd: pd = ""
 434         fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
 435
 436         return fields
 437
 438     def add_gaps(self, fields, fieldname):
 439         """
 440         Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 441         This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 442         """
 443         def gap():
 444             while True:
 445                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 446         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 447
 448     def get_master(self, root):
 449         """
 450         Returns the first master tag from an etree.
 451         """
 452         for master in root.iter():
 453             if master.tag in self.master_tags:
 454                 return master
 455
 456     def index_content(self, book, book_fields=[]):
 457         """
 458         Walks the book XML and extract content from it.
 459         Adds parts for each header tag and for each fragment.
 460         """
 461         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 462         root = wld.edoc.getroot()
 463
 464         master = self.get_master(root)
 465         if master is None:
 466             return []
 467
 468         def walker(node, ignore_tags=[]):
 469
 470             if node.tag not in ignore_tags:
 471                 yield node, None, None
 472                 if node.text is not None:
 473                     yield None, node.text, None
 474                 for child in list(node):
 475                     for b, t, e in walker(child):
 476                         yield b, t, e
 477                 yield None, None, node
 478
 479             if node.tail is not None:
 480                 yield None, node.tail, None
 481             return
 482
 483         def fix_format(text):
 484             #            separator = [u" ", u"\t", u".", u";", u","]
 485             if isinstance(text, list):
 486                 # need to join it first
 487                 text = filter(lambda s: s is not None, content)
 488                 text = u' '.join(text)
 489                 # for i in range(len(text)):
 490                 #     if i > 0:
 491                 #         if text[i][0] not in separator\
 492                 #             and text[i - 1][-1] not in separator:
 493                 #          text.insert(i, u" ")
 494
 495             return re.sub("(?m)/$", "", text)
 496
 497         def add_part(snippets, **fields):
 498             doc = self.create_book_doc(book)
 499             for f in book_fields:
 500                 doc.add(f)
 501
 502             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
 503             doc.add(NumericField("header_span", Field.Store.YES, True)\
 504                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
 505             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
 506
 507             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
 508                           Field.TermVector.WITH_POSITIONS_OFFSETS))
 509
 510             snip_pos = snippets.add(fields["content"])
 511             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
 512             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
 513             if snippets.revision:
 514                 doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
 515
 516             if 'fragment_anchor' in fields:
 517                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
 518                               Field.Store.YES, Field.Index.NOT_ANALYZED))
 519
 520             if 'themes' in fields:
 521                 themes, themes_pl = zip(*[
 522                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
 523                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
 524                      for theme in fields['themes']])
 525
 526                 themes = self.add_gaps(themes, 'themes')
 527                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
 528
 529                 for t in themes:
 530                     doc.add(t)
 531                 for t in themes_pl:
 532                     doc.add(t)
 533
 534             return doc
 535
 536         def give_me_utf8(s):
 537             if isinstance(s, unicode):
 538                 return s.encode('utf-8')
 539             else:
 540                 return s
 541
 542         fragments = {}
 543         snippets = Snippets(book.id).open('w')
 544         try:
 545             for header, position in zip(list(master), range(len(master))):
 546
 547                 if header.tag in self.skip_header_tags:
 548                     continue
 549                 if header.tag is etree.Comment:
 550                     continue
 551
 552                 # section content
 553                 content = []
 554                 footnote = []
 555
 556                 def all_content(text):
 557                     for frag in fragments.values():
 558                         frag['content'].append(text)
 559                     content.append(text)
 560                 handle_text = [all_content]
 561
 562
 563                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 564                     # handle footnotes
 565                     if start is not None and start.tag in self.footnote_tags:
 566                         footnote = []
 567                         def collect_footnote(t):
 568                             footnote.append(t)
 569                         handle_text.append(collect_footnote)
 570                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 571                         handle_text.pop()
 572                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 573                                        content=u''.join(footnote),
 574                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
 575
 576                         self.index.addDocument(doc)
 577                         #print "@ footnote text: %s" % footnote
 578                         footnote = []
 579
 580                     # handle fragments and themes.
 581                     if start is not None and start.tag == 'begin':
 582                         fid = start.attrib['id'][1:]
 583                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 584
 585                     # themes for this fragment
 586                     elif start is not None and start.tag == 'motyw':
 587                         fid = start.attrib['id'][1:]
 588                         handle_text.append(None)
 589                         if start.text is not None:
 590                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
 591                     elif end is not None and end.tag == 'motyw':
 592                         handle_text.pop()
 593
 594                     elif start is not None and start.tag == 'end':
 595                         fid = start.attrib['id'][1:]
 596                         if fid not in fragments:
 597                             continue  # a broken <end> node, skip it
 598                         frag = fragments[fid]
 599                         if frag['themes'] == []:
 600                             continue  # empty themes list.
 601                         del fragments[fid]
 602
 603                         doc = add_part(snippets,
 604                                        header_type=frag['start_header'],
 605                                        header_index=frag['start_section'],
 606                                        header_span=position - frag['start_section'] + 1,
 607                                        fragment_anchor=fid,
 608                                        content=fix_format(frag['content']),
 609                                        themes=frag['themes'])
 610                         #print '@ FRAG %s' % frag['content']
 611                         self.index.addDocument(doc)
 612
 613                         # Collect content.
 614
 615                     if text is not None and handle_text is not []:
 616                         hdl = handle_text[-1]
 617                         if hdl is not None:
 618                             hdl(text)
 619
 620                         # in the end, add a section text.
 621                 doc = add_part(snippets, header_index=position, header_type=header.tag,
 622                                content=fix_format(content))
 623                 #print '@ CONTENT: %s' % fix_format(content)
 624
 625                 self.index.addDocument(doc)
 626
 627         finally:
 628             snippets.close()
 629
 630
 631 def log_exception_wrapper(f):
 632     def _wrap(*a):
 633         try:
 634             f(*a)
 635         except Exception, e:
 636             log.error("Error in indexing thread: %s" % e)
 637             traceback.print_exc()
 638             raise e
 639     return _wrap
 640
 641
 642 class ReusableIndex(Index):
 643     """
 644     Works like index, but does not close/optimize Lucene index
 645     until program exit (uses atexit hook).
 646     This is usefull for importbooks command.
 647
 648     if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
 649     """
 650     index = None
 651
 652     def open(self, analyzer=None, **kw):
 653         if ReusableIndex.index:
 654             self.index = ReusableIndex.index
 655         else:
 656             Index.open(self, analyzer, **kw)
 657             ReusableIndex.index = self.index
 658             atexit.register(ReusableIndex.close_reusable)
 659
 660     # def index_book(self, *args, **kw):
 661     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
 662     #     ReusableIndex.pool_jobs.append(job)
 663
 664     @staticmethod
 665     def close_reusable():
 666         if ReusableIndex.index:
 667             ReusableIndex.index.optimize()
 668             ReusableIndex.index.close()
 669             ReusableIndex.index = None
 670
 671             index_changed.send_robust(None)
 672
 673     def close(self):
 674         if ReusableIndex.index:
 675             ReusableIndex.index.commit()
 676
 677
 678 class JoinSearch(object):
 679     """
 680     This mixin could be used to handle block join queries.
 681     (currently unused)
 682     """
 683     def __init__(self, *args, **kw):
 684         super(JoinSearch, self).__init__(*args, **kw)
 685
 686     def wrapjoins(self, query, fields=[]):
 687         """
 688         This functions modifies the query in a recursive way,
 689         so Term and Phrase Queries contained, which match
 690         provided fields are wrapped in a BlockJoinQuery,
 691         and so delegated to children documents.
 692         """
 693         if BooleanQuery.instance_(query):
 694             qs = BooleanQuery.cast_(query)
 695             for clause in qs:
 696                 clause = BooleanClause.cast_(clause)
 697                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
 698             return qs
 699         else:
 700             termset = HashSet()
 701             query.extractTerms(termset)
 702             for t in termset:
 703                 t = Term.cast_(t)
 704                 if t.field() not in fields:
 705                     return query
 706             return BlockJoinQuery(query, self.parent_filter,
 707                                   BlockJoinQuery.ScoreMode.Total)
 708
 709     def bsearch(self, query, max_results=50):
 710         q = self.query(query)
 711         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
 712
 713         tops = self.searcher.search(bjq, max_results)
 714         bks = []
 715         for found in tops.scoreDocs:
 716             doc = self.searcher.doc(found.doc)
 717             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 718         return (bks, tops.totalHits)
 719
 720
 721 class SearchResult(object):
 722     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
 723         if tokens_cache is None: tokens_cache = {}
 724
 725         if score:
 726             self._score = score
 727         else:
 728             self._score = scoreDocs.score
 729
 730         self.boost = 1.0
 731
 732         self._hits = []
 733         self._processed_hits = None  # processed hits
 734
 735         stored = search.searcher.doc(scoreDocs.doc)
 736         self.book_id = int(stored.get("book_id"))
 737
 738         pd = stored.get("published_date")
 739         try:
 740             self.published_date = int(pd)
 741         except ValueError:
 742             self.published_date = 0
 743
 744         header_type = stored.get("header_type")
 745         # we have a content hit in some header of fragment
 746         if header_type is not None:
 747             sec = (header_type, int(stored.get("header_index")))
 748             header_span = stored.get('header_span')
 749             header_span = header_span is not None and int(header_span) or 1
 750
 751             fragment = stored.get("fragment_anchor")
 752
 753             if snippets:
 754                 snippets = snippets.replace("/\n", "\n")
 755             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
 756
 757             self._hits.append(hit)
 758
 759         self.search = search
 760         self.searched = searched
 761         self.tokens_cache = tokens_cache
 762
 763     @property
 764     def score(self):
 765         return self._score * self.boost
 766
 767     def merge(self, other):
 768         if self.book_id != other.book_id:
 769             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 770         self._hits += other._hits
 771         if other.score > self.score:
 772             self._score = other._score
 773         return self
 774
 775     def get_book(self):
 776         if hasattr(self, '_book'):
 777             return self._book
 778         return catalogue.models.Book.objects.get(id=self.book_id)
 779
 780     book = property(get_book)
 781
 782     @property
 783     def hits(self):
 784         if self._processed_hits is not None:
 785             return self._processed_hits
 786
 787         POSITION = 0
 788         FRAGMENT = 1
 789         POSITION_INDEX = 1
 790         POSITION_SPAN = 2
 791         SCORE = 2
 792         OTHER = 3
 793
 794         # to sections and fragments
 795         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
 796
 797         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
 798
 799         # sections not covered by fragments
 800         sect = filter(lambda s: 0 == len(filter(
 801             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
 802             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
 803             frags)), sect)
 804
 805         hits = []
 806
 807         def remove_duplicates(lst, keyfn, compare):
 808             els = {}
 809             for e in lst:
 810                 eif = keyfn(e)
 811                 if eif in els:
 812                     if compare(els[eif], e) >= 1:
 813                         continue
 814                 els[eif] = e
 815             return els.values()
 816
 817         # remove fragments with duplicated fid's and duplicated snippets
 818         frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
 819         frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
 820                                   lambda a, b: cmp(a[SCORE], b[SCORE]))
 821
 822         # remove duplicate sections
 823         sections = {}
 824
 825         for s in sect:
 826             si = s[POSITION][POSITION_INDEX]
 827             # skip existing
 828             if si in sections:
 829                 if sections[si]['score'] >= s[SCORE]:
 830                     continue
 831
 832             m = {'score': s[SCORE],
 833                  'section_number': s[POSITION][POSITION_INDEX] + 1,
 834                  }
 835             m.update(s[OTHER])
 836             sections[si] = m
 837
 838         hits = sections.values()
 839
 840         for f in frags:
 841             try:
 842                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
 843             except catalogue.models.Fragment.DoesNotExist:
 844                 # stale index
 845                 continue
 846
 847             # Figure out if we were searching for a token matching some word in theme name.
 848             themes = frag.tags.filter(category='theme')
 849             themes_hit = []
 850             if self.searched is not None:
 851                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
 852                 for theme in themes:
 853                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
 854                     for t in tokens:
 855                         if t in name_tokens:
 856                             if not theme in themes_hit:
 857                                 themes_hit.append(theme)
 858                             break
 859
 860             m = {'score': f[SCORE],
 861                  'fragment': frag,
 862                  'section_number': f[POSITION][POSITION_INDEX] + 1,
 863                  'themes': themes,
 864                  'themes_hit': themes_hit
 865                  }
 866             m.update(f[OTHER])
 867             hits.append(m)
 868
 869         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 870
 871         self._processed_hits = hits
 872
 873         return hits
 874
 875     def __unicode__(self):
 876         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
 877
 878     @staticmethod
 879     def aggregate(*result_lists):
 880         books = {}
 881         for rl in result_lists:
 882             for r in rl:
 883                 if r.book_id in books:
 884                     books[r.book_id].merge(r)
 885                 else:
 886                     books[r.book_id] = r
 887         return books.values()
 888
 889     def __cmp__(self, other):
 890         c = cmp(self.score, other.score)
 891         if c == 0:
 892             # this is inverted, because earlier date is better
 893             return cmp(other.published_date, self.published_date)
 894         else:
 895             return c
 896
 897
 898 class Hint(object):
 899     """
 900     Given some hint information (information we already know about)
 901     our search target - like author, title (specific book), epoch, genre, kind
 902     we can narrow down search using filters.
 903     """
 904     def __init__(self, search):
 905         """
 906         Accepts a Searcher instance.
 907         """
 908         self.search = search
 909         self.book_tags = {}
 910         self.part_tags = []
 911         self._books = []
 912
 913     def books(self, *books):
 914         """
 915         Give a hint that we search these books.
 916         """
 917         self._books = books
 918
 919     def tags(self, tags):
 920         """
 921         Give a hint that these Tag objects (a list of)
 922         is necessary.
 923         """
 924         for t in tags:
 925             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
 926                 lst = self.book_tags.get(t.category, [])
 927                 lst.append(t)
 928                 self.book_tags[t.category] = lst
 929             if t.category in ['theme', 'theme_pl']:
 930                 self.part_tags.append(t)
 931
 932     def tag_filter(self, tags, field='tags'):
 933         """
 934         Given a lsit of tags and an optional field (but they are normally in tags field)
 935         returns a filter accepting only books with specific tags.
 936         """
 937         q = BooleanQuery()
 938
 939         for tag in tags:
 940             toks = self.search.get_tokens(tag.name, field=field)
 941             tag_phrase = PhraseQuery()
 942             for tok in toks:
 943                 tag_phrase.add(Term(field, tok))
 944             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
 945
 946         return QueryWrapperFilter(q)
 947
 948     def book_filter(self):
 949         """
 950         Filters using book tags (all tag kinds except a theme)
 951         """
 952         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
 953         if tags:
 954             return self.tag_filter(tags)
 955         else:
 956             return None
 957
 958     def part_filter(self):
 959         """
 960         This filter can be used to look for book parts.
 961         It filters on book id and/or themes.
 962         """
 963         fs = []
 964         if self.part_tags:
 965             fs.append(self.tag_filter(self.part_tags, field='themes'))
 966
 967         if self._books != []:
 968             bf = BooleanFilter()
 969             for b in self._books:
 970                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
 971                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
 972             fs.append(bf)
 973
 974         return Search.chain_filters(fs)
 975
 976     def should_search_for_book(self):
 977         return self._books == []
 978
 979     def just_search_in(self, all):
 980         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
 981         some = []
 982         for field in all:
 983             if field == 'authors' and 'author' in self.book_tags:
 984                 continue
 985             if field == 'title' and self._books != []:
 986                 continue
 987             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
 988                 continue
 989             some.append(field)
 990         return some
 991
 992
 993 class Search(IndexStore):
 994     """
 995     Search facilities.
 996     """
 997     def __init__(self, default_field="content"):
 998         IndexStore.__init__(self)
 999         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
1000         # self.analyzer = WLAnalyzer()
1001         reader = IndexReader.open(self.store, True)
1002         self.searcher = IndexSearcher(reader)
1003         self.parser = QueryParser(Version.LUCENE_34, default_field,
1004                                   self.analyzer)
1005
1006         self.parent_filter = TermsFilter()
1007         self.parent_filter.addTerm(Term("is_book", "true"))
1008         index_changed.connect(self.reopen)
1009
1010     def close(self):
1011         reader = self.searcher.getIndexReader()
1012         self.searcher.close()
1013         reader.close()
1014         super(Search, self).close()
1015         index_changed.disconnect(self.reopen)
1016
1017     def reopen(self, **unused):
1018         reader = self.searcher.getIndexReader()
1019         rdr = reader.reopen()
1020         if not rdr.equals(reader):
1021             log.debug('Reopening index')
1022             oldsearch = self.searcher
1023             self.searcher = IndexSearcher(rdr)
1024             oldsearch.close()
1025             reader.close()
1026
1027     def query(self, query):
1028         """Parse query in default Lucene Syntax. (for humans)
1029         """
1030         return self.parser.parse(query)
1031
1032     def simple_search(self, query, max_results=50):
1033         """Runs a query for books using lucene syntax. (for humans)
1034         Returns (books, total_hits)
1035         """
1036
1037         tops = self.searcher.search(self.query(query), max_results)
1038         bks = []
1039         for found in tops.scoreDocs:
1040             doc = self.searcher.doc(found.doc)
1041             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1042         return (bks, tops.totalHits)
1043
1044     def get_tokens(self, searched, field='content', cached=None):
1045         """returns tokens analyzed by a proper (for a field) analyzer
1046         argument can be: StringReader, string/unicode, or tokens. In the last case
1047         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
1048         """
1049         if cached is not None and field in cached:
1050             return cached[field]
1051
1052         if isinstance(searched, str) or isinstance(searched, unicode):
1053             searched = StringReader(searched)
1054         elif isinstance(searched, list):
1055             return searched
1056
1057         searched.reset()
1058         tokens = self.analyzer.reusableTokenStream(field, searched)
1059         toks = []
1060         while tokens.incrementToken():
1061             cta = tokens.getAttribute(CharTermAttribute.class_)
1062             toks.append(cta.toString())
1063
1064         if cached is not None:
1065             cached[field] = toks
1066
1067         return toks
1068
1069     @staticmethod
1070     def fuzziness(fuzzy):
1071         """Helper method to sanitize fuzziness"""
1072         if not fuzzy:
1073             return None
1074         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
1075             return fuzzy
1076         else:
1077             return 0.5
1078
1079     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
1080         """
1081         Return a PhraseQuery with a series of tokens.
1082         """
1083         if fuzzy:
1084             phrase = MultiPhraseQuery()
1085             for t in tokens:
1086                 term = Term(field, t)
1087                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
1088                 fuzzterms = []
1089
1090                 while True:
1091                     ft = fuzzterm.term()
1092                     if ft:
1093                         fuzzterms.append(ft)
1094                     if not fuzzterm.next(): break
1095                 if fuzzterms:
1096                     phrase.add(JArray('object')(fuzzterms, Term))
1097                 else:
1098                     phrase.add(term)
1099         else:
1100             phrase = PhraseQuery()
1101             phrase.setSlop(slop)
1102             for t in tokens:
1103                 term = Term(field, t)
1104                 phrase.add(term)
1105         return phrase
1106
1107     @staticmethod
1108     def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
1109         """
1110         Returns term queries joined by boolean query.
1111         modal - applies to boolean query
1112         fuzzy - should the query by fuzzy.
1113         """
1114         q = BooleanQuery()
1115         for t in tokens:
1116             term = Term(field, t)
1117             if fuzzy:
1118                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1119             else:
1120                 term = TermQuery(term)
1121             q.add(BooleanClause(term, modal))
1122         return q
1123
1124     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1125                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1126         if filters is None: filters = []
1127         if tokens_cache is None: tokens_cache = {}
1128
1129         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1130
1131         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1132         if book:
1133             filters.append(self.term_filter(Term('is_book', 'true')))
1134         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1135
1136         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1137
1138     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1139                     filters=None, tokens_cache=None, boost=None, snippets=True):
1140         if filters is None: filters = []
1141         if tokens_cache is None: tokens_cache = {}
1142
1143         if book:
1144             filters.append(self.term_filter(Term('is_book', 'true')))
1145
1146         query = BooleanQuery()
1147
1148         for fld in fields:
1149             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1150
1151             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1152                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1153
1154         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1155
1156         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1157                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1158
1159     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1160         """
1161         Search for perfect book matches. Just see if the query matches with some author or title,
1162         taking hints into account.
1163         """
1164         fields_to_search = ['authors', 'title']
1165         only_in = None
1166         if hint:
1167             if not hint.should_search_for_book():
1168                 return []
1169             fields_to_search = hint.just_search_in(fields_to_search)
1170             only_in = hint.book_filter()
1171
1172         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1173
1174         books = []
1175         for q in qrys:
1176             top = self.searcher.search(q,
1177                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1178                 max_results)
1179             for found in top.scoreDocs:
1180                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1181         return books
1182
1183     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1184         fields_to_search = ['tags', 'authors', 'title']
1185
1186         only_in = None
1187         if hint:
1188             if not hint.should_search_for_book():
1189                 return []
1190             fields_to_search = hint.just_search_in(fields_to_search)
1191             only_in = hint.book_filter()
1192
1193         tokens = self.get_tokens(searched, field='SIMPLE')
1194
1195         q = BooleanQuery()
1196
1197         for fld in fields_to_search:
1198             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1199                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1200
1201         books = []
1202         top = self.searcher.search(q,
1203                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1204             max_results)
1205         for found in top.scoreDocs:
1206             books.append(SearchResult(self, found, how_found="search_book"))
1207
1208         return books
1209
1210     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1211         """
1212         Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
1213         some part/fragment of the book.
1214         """
1215         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1216
1217         flt = None
1218         if hint:
1219             flt = hint.part_filter()
1220
1221         books = []
1222         for q in qrys:
1223             top = self.searcher.search(q,
1224                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1225                                                            flt]),
1226                                        max_results)
1227             for found in top.scoreDocs:
1228                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1229
1230         return books
1231
1232     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1233         """
1234         Tries to use search terms to match different fields of book (or its parts).
1235         E.g. one word can be an author survey, another be a part of the title, and the rest
1236         are some words from third chapter.
1237         """
1238         if tokens_cache is None: tokens_cache = {}
1239         books = []
1240         only_in = None
1241
1242         if hint:
1243             only_in = hint.part_filter()
1244
1245         # content only query : themes x content
1246         q = BooleanQuery()
1247
1248         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1249         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1250
1251         # only search in themes when we do not already filter by themes
1252         if hint is None or hint.just_search_in(['themes']) != []:
1253             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1254                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1255
1256         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1257                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1258
1259         topDocs = self.searcher.search(q, only_in, max_results)
1260         for found in topDocs.scoreDocs:
1261             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1262
1263         # query themes/content x author/title/tags
1264         q = BooleanQuery()
1265         in_content = BooleanQuery()
1266         in_meta = BooleanQuery()
1267
1268         for fld in ['themes_pl', 'content']:
1269             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1270
1271         for fld in ['tags', 'authors', 'title']:
1272             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1273
1274         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1275         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1276
1277         topDocs = self.searcher.search(q, only_in, max_results)
1278         for found in topDocs.scoreDocs:
1279             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1280
1281         return books
1282
1283     # def multisearch(self, query, max_results=50):
1284     #     """
1285     #     Search strategy:
1286     #     - (phrase) OR -> content
1287     #                   -> title
1288     #                   -> authors
1289     #     - (keywords)  -> authors
1290     #                   -> motyw
1291     #                   -> tags
1292     #                   -> content
1293     #     """
1294         # queryreader = StringReader(query)
1295         # tokens = self.get_tokens(queryreader)
1296
1297         # top_level = BooleanQuery()
1298         # Should = BooleanClause.Occur.SHOULD
1299
1300         # phrase_level = BooleanQuery()
1301         # phrase_level.setBoost(1.3)
1302
1303         # p_content = self.make_phrase(tokens, joined=True)
1304         # p_title = self.make_phrase(tokens, 'title')
1305         # p_author = self.make_phrase(tokens, 'author')
1306
1307         # phrase_level.add(BooleanClause(p_content, Should))
1308         # phrase_level.add(BooleanClause(p_title, Should))
1309         # phrase_level.add(BooleanClause(p_author, Should))
1310
1311         # kw_level = BooleanQuery()
1312
1313         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1314         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1315         # kw_level.add(j_themes, Should)
1316         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1317         # j_con = self.make_term_query(tokens, joined=True)
1318         # kw_level.add(j_con, Should)
1319
1320         # top_level.add(BooleanClause(phrase_level, Should))
1321         # top_level.add(BooleanClause(kw_level, Should))
1322
1323         # return None
1324
1325     def get_snippets(self, scoreDoc, query, field='content'):
1326         """
1327         Returns a snippet for found scoreDoc.
1328         """
1329         htmlFormatter = SimpleHTMLFormatter()
1330         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1331
1332         stored = self.searcher.doc(scoreDoc.doc)
1333
1334         position = stored.get('snippets_position')
1335         length = stored.get('snippets_length')
1336         if position is None or length is None:
1337             return None
1338         revision = stored.get('snippets_revision')
1339         if revision: revision = int(revision)
1340
1341         # locate content.
1342         book_id = int(stored.get('book_id'))
1343         snippets = Snippets(book_id, revision=revision)
1344
1345         try:
1346             snippets.open()
1347         except IOError, e:
1348             log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
1349             return []
1350
1351         try:
1352             try:
1353                 text = snippets.get((int(position),
1354                                      int(length)))
1355             finally:
1356                 snippets.close()
1357
1358             tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1359             #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1360             snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1361
1362         except Exception, e:
1363             e2 = e
1364             if hasattr(e, 'getJavaException'):
1365                 e2 = unicode(e.getJavaException())
1366             raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1367                 e2)
1368         return snip
1369
1370     @staticmethod
1371     def enum_to_array(enum):
1372         """
1373         Converts a lucene TermEnum to array of Terms, suitable for
1374         addition to queries
1375         """
1376         terms = []
1377
1378         while True:
1379             t = enum.term()
1380             if t:
1381                 terms.append(t)
1382             if not enum.next(): break
1383
1384         if terms:
1385             return JArray('object')(terms, Term)
1386
1387     def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
1388         """
1389         Search for Tag objects using query.
1390         """
1391         if not pdcounter:
1392             filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1393         tops = self.searcher.search(query, filt, max_results)
1394
1395         tags = []
1396         for found in tops.scoreDocs:
1397             doc = self.searcher.doc(found.doc)
1398             is_pdcounter = doc.get('is_pdcounter')
1399             category = doc.get('tag_category')
1400             try:
1401                 if is_pdcounter == 'true':
1402                     if category == 'pd_author':
1403                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1404                     elif category == 'pd_book':
1405                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1406                         tag.category = 'pd_book'  # make it look more lik a tag.
1407                     else:
1408                         print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1409                 else:
1410                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1411                     # don't add the pdcounter tag if same tag already exists
1412                 if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1413                     tags.append(tag)
1414             except catalogue.models.Tag.DoesNotExist: pass
1415             except PDCounterAuthor.DoesNotExist: pass
1416             except PDCounterBook.DoesNotExist: pass
1417
1418         log.debug('search_tags: %s' % tags)
1419
1420         return tags
1421
1422     def search_books(self, query, filt=None, max_results=10):
1423         """
1424         Searches for Book objects using query
1425         """
1426         bks = []
1427         tops = self.searcher.search(query, filt, max_results)
1428         for found in tops.scoreDocs:
1429             doc = self.searcher.doc(found.doc)
1430             try:
1431                 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1432             except catalogue.models.Book.DoesNotExist: pass
1433         return bks
1434
1435     def make_prefix_phrase(self, toks, field):
1436         q = MultiPhraseQuery()
1437         for i in range(len(toks)):
1438             t = Term(field, toks[i])
1439             if i == len(toks) - 1:
1440                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1441                 if pterms:
1442                     q.add(pterms)
1443                 else:
1444                     q.add(t)
1445             else:
1446                 q.add(t)
1447         return q
1448
1449     @staticmethod
1450     def term_filter(term, inverse=False):
1451         only_term = TermsFilter()
1452         only_term.addTerm(term)
1453
1454         if inverse:
1455             neg = BooleanFilter()
1456             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1457             only_term = neg
1458
1459         return only_term
1460
1461     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1462         """
1463         Return auto-complete hints for tags
1464         using prefix search.
1465         """
1466         toks = self.get_tokens(string, field='SIMPLE')
1467         top = BooleanQuery()
1468
1469         for field in ['tag_name', 'tag_name_pl']:
1470             if prefix:
1471                 q = self.make_prefix_phrase(toks, field)
1472             else:
1473                 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1474             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1475
1476         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1477
1478         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1479
1480     def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1481         """
1482         Returns auto-complete hints for book titles
1483         Because we do not index 'pseudo' title-tags.
1484         Prefix search.
1485         """
1486         toks = self.get_tokens(string, field='SIMPLE')
1487
1488         if prefix:
1489             q = self.make_prefix_phrase(toks, 'title')
1490         else:
1491             q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1492
1493         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1494
1495     @staticmethod
1496     def chain_filters(filters, op=ChainedFilter.AND):
1497         """
1498         Chains a filter list together
1499         """
1500         filters = filter(lambda x: x is not None, filters)
1501         if not filters or filters is []:
1502             return None
1503         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1504         return chf
1505
1506     def filtered_categories(self, tags):
1507         """
1508         Return a list of tag categories, present in tags list.
1509         """
1510         cats = {}
1511         for t in tags:
1512             cats[t.category] = True
1513         return cats.keys()
1514
1515     def hint(self):
1516         return Hint(self)