1 # -*- coding: utf-8 -*-
 
   3 from django.conf import settings
 
   4 from django.dispatch import Signal
 
   5 from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
 
   6     File, Field, Integer, \
 
   7     NumericField, Version, Document, JavaError, IndexSearcher, \
 
   8     QueryParser, PerFieldAnalyzerWrapper, \
 
   9     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
 
  10     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
 
  11     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
 
  12     HashSet, BooleanClause, Term, CharTermAttribute, \
 
  13     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
 
  14     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
 
  15     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
 
  16     BooleanFilter, FilterClause, QueryWrapperFilter, \
 
  17     initVM, CLASSPATH, JArray
 
  21 JVM = initVM(CLASSPATH, maxheap=settings.JVM_MAXHEAP)
 
  27 from librarian import dcparser
 
  28 from librarian.parser import WLDocument
 
  29 from lxml import etree
 
  30 import catalogue.models
 
  31 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
 
  32 from multiprocessing.pool import ThreadPool
 
  33 from threading import current_thread
 
  34 from itertools import chain
 
  38 log = logging.getLogger('search')
 
  40 class WLAnalyzer(PerFieldAnalyzerWrapper):
 
  42         polish = PolishAnalyzer(Version.LUCENE_34)
 
  43         #        polish_gap.setPositionIncrementGap(999)
 
  45         simple = SimpleAnalyzer(Version.LUCENE_34)
 
  46         #        simple_gap.setPositionIncrementGap(999)
 
  48         keyword = KeywordAnalyzer(Version.LUCENE_34)
 
  50         # not sure if needed: there's NOT_ANALYZED meaning basically the same
 
  52         PerFieldAnalyzerWrapper.__init__(self, polish)
 
  54         self.addAnalyzer("tags", simple)
 
  55         self.addAnalyzer("technical_editors", simple)
 
  56         self.addAnalyzer("editors", simple)
 
  57         self.addAnalyzer("url", keyword)
 
  58         self.addAnalyzer("source_url", keyword)
 
  59         self.addAnalyzer("source_name", simple)
 
  60         self.addAnalyzer("publisher", simple)
 
  61         self.addAnalyzer("authors", simple)
 
  62         self.addAnalyzer("title", simple)
 
  64         self.addAnalyzer("is_book", keyword)
 
  65         # shouldn't the title have two forms? _pl and simple?
 
  67         self.addAnalyzer("themes", simple)
 
  68         self.addAnalyzer("themes_pl", polish)
 
  70         self.addAnalyzer("tag_name", simple)
 
  71         self.addAnalyzer("tag_name_pl", polish)
 
  73         self.addAnalyzer("translators", simple)
 
  75         self.addAnalyzer("KEYWORD", keyword)
 
  76         self.addAnalyzer("SIMPLE", simple)
 
  77         self.addAnalyzer("POLISH", polish)
 
  80 class IndexStore(object):
 
  82     Provides access to search index.
 
  84     self.store - lucene index directory
 
  88         self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))
 
  90     def make_index_dir(self):
 
  92             os.makedirs(settings.SEARCH_INDEX)
 
  93         except OSError as exc:
 
  94             if exc.errno == errno.EEXIST:
 
 102 class IndexChecker(IndexStore):
 
 104         IndexStore.__init__(self)
 
 107         checker = CheckIndex(self.store)
 
 108         status = checker.checkIndex()
 
 112 class Snippets(object):
 
 114     This class manages snippet files for an indexed object (book).

 115     The snippets are concatenated together, and their positions and

 116     lengths are kept in lucene index fields.
 
 118     SNIPPET_DIR = "snippets"
 
 120     def __init__(self, book_id, revision=None):
 
 122             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
 
 123         except OSError as exc:
 
 124             if exc.errno == errno.EEXIST:
 
 127         self.book_id = book_id
 
 128         self.revision = revision
 
 133         if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
 
 134         else: fn = "%d" % self.book_id
 
 136         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
 
 138     def open(self, mode='r'):
 
 140         Open the snippet file. Call .close() afterwards.
 
 146             if os.path.exists(self.path):
 
 149                     if not os.path.exists(self.path):
 
 153         self.file = open(self.path, mode)
 
 157     def add(self, snippet):
 
 159         Append a snippet (unicode) to the snippet file.
 
 160         Return a (position, length) tuple
 
 162         txt = snippet.encode('utf-8')
 
 165         pos = (self.position, l)
 
 171         Given a tuple of (position, length) return a unicode string

 172         of the snippet stored there.
 
 174         self.file.seek(pos[0], 0)
 
 175         txt = self.file.read(pos[1]).decode('utf-8')
 
 179         """Close snippet file"""
 
 194 class BaseIndex(IndexStore):
 
 197     Provides basic operations on index: opening, closing, optimizing.
 
 199     def __init__(self, analyzer=None):
 
 200         super(BaseIndex, self).__init__()
 
 203             analyzer = WLAnalyzer()
 
 204         self.analyzer = analyzer
 
 206     def open(self, timeout=None):
 
 208             raise Exception("Index is already opened")
 
 209         conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
 
 211             conf.setWriteLockTimeout(long(timeout))
 
 212         self.index = IndexWriter(self.store, conf)
 
 216         self.index.optimize()
 
 220             self.index.optimize()
 
 221         except JavaError as je:
 
 222             log.error("Error during optimize phase, check index: %s" % je)
 
 227         index_changed.send_robust(self)
 
 229         super(BaseIndex, self).close()
 
 235     def __exit__(self, type, value, tb):
 
 239 index_changed = Signal()
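
# Illustrative sketch (not part of the original code): BaseIndex.close() sends
# index_changed (see above), which Search uses to reopen its reader. Any other
# component can listen the same way; the receiver below is hypothetical.
def _example_connect_index_listener():
    def reindexed(sender, **kwargs):
        log.debug("search index changed by %r" % sender)
    index_changed.connect(reindexed, weak=False)   # weak=False keeps the closure alive
    return reindexed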
 
 242 class Index(BaseIndex):
 
 244     Class indexing books.
 
 246     def __init__(self, analyzer=None):
 
 247         super(Index, self).__init__(analyzer)
 
 249     def index_tags(self, *tags, **kw):
 
 251         Re-index global tag list.
 
 252         Removes all tags from the index, then indexes them again.
 
 253         Indexed fields include: id, name (with and without polish stems), category
 
 255         remove_only = kw.get('remove_only', False)
 
 256         # first, remove tags from index.
 
 260                 b_id_cat = BooleanQuery()
 
 262                 q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
 
 263                 b_id_cat.add(q_id, BooleanClause.Occur.MUST)
 
 265                 if isinstance(tag, PDCounterAuthor):
 
 266                     q_cat = TermQuery(Term('tag_category', 'pd_author'))
 
 267                 elif isinstance(tag, PDCounterBook):
 
 268                     q_cat = TermQuery(Term('tag_category', 'pd_book'))
 
 270                     q_cat = TermQuery(Term('tag_category', tag.category))
 
 271                 b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
 
 273                 q.add(b_id_cat, BooleanClause.Occur.SHOULD)
 
 275             q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
 
 276             self.index.deleteDocuments(q)
 
 279             # then add them [all or just one passed]
 
 281                 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
 
 282                     PDCounterAuthor.objects.all(), \
 
 283                     PDCounterBook.objects.all())
 
 286                 if isinstance(tag, PDCounterAuthor):
 
 288                     doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
 
 289                     doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 
 290                     doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 
 291                     doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
 
 292                     doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
 
 293                     self.index.addDocument(doc)
 
 294                 elif isinstance(tag, PDCounterBook):
 
 296                     doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
 
 297                     doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
 
 298                     doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
 
 299                     doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
 
 300                     doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
 
 301                     self.index.addDocument(doc)
 
 304                     doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
 
 305                     doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 
 306                     doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
 
 307                     doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
 
 308                     self.index.addDocument(doc)
 
 310     def create_book_doc(self, book):
 
 312         Create a lucene document referring to the book id.
 
 315         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
 
 316         if book.parent is not None:
 
 317             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
 
 320     def remove_book(self, book_or_id, remove_snippets=True):
 
 321         """Removes a book from search index.
 
 322         book - Book instance."""
 
 323         if isinstance(book_or_id, catalogue.models.Book):
 
 324             book_id = book_or_id.id
 
 328         q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
 
 329         self.index.deleteDocuments(q)
 
 332             snippets = Snippets(book_id)
 
 335     def index_book(self, book, book_info=None, overwrite=True):
 
 338         Creates a lucene document for extracted metadata
 
 339         and calls self.index_content() to index the contents of the book.
 
 342             # we don't remove snippets, since they might still be needed by

 343             # threads using an index that has not been reopened yet
 
 344             self.remove_book(book, remove_snippets=False)
 
 346         book_doc = self.create_book_doc(book)
 
 347         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
 
 348         # let's not index it - it's only used for extracting publish date
 
 349         if 'source_name' in meta_fields:
 
 350             del meta_fields['source_name']
 
 352         for f in meta_fields.values():
 
 353         if isinstance(f, (list, tuple)):
 
 358         self.index.addDocument(book_doc)
 
 361         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
 
 366         'dramat_wierszowany_l',
 
 367         'dramat_wierszowany_lp',
 
 368         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 
 372     ignore_content_tags = [
 
 374         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 
 376         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 
 379     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 
 381     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 
 383     published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
 
 385     def extract_metadata(self, book, book_info=None, dc_only=None):
 
 387         Extracts metadata from the book and returns a map of fields keyed by field name
 
 391         if book_info is None:
 
 392             book_info = dcparser.parse(open(book.xml_file.path))
 
 394         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
 
 395         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
 
 396         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
 
 399         for field in dcparser.BookInfo.FIELDS:
 
 400             if dc_only and field.name not in dc_only:
 
 402             if hasattr(book_info, field.name):
 
 403                 if not getattr(book_info, field.name):
 
 405                 # since no type information is available, we use validator
 
 406                 type_indicator = field.validator
 
 407                 if type_indicator == dcparser.as_unicode:
 
 408                     s = getattr(book_info, field.name)
 
 412                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
 
 413                     except JavaError as je:
 
 414                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
 
 415                 elif type_indicator == dcparser.as_person:
 
 416                     p = getattr(book_info, field.name)
 
 417                     if isinstance(p, dcparser.Person):
 
 420                         persons = ', '.join(map(unicode, p))
 
 421                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
 
 422                 elif type_indicator == dcparser.as_date:
 
 423                     dt = getattr(book_info, field.name)
 
 424                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
 
 425                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 
 429         if hasattr(book_info, 'source_name') and book_info.source_name:
 
 430             match = self.published_date_re.search(book_info.source_name)
 
 431             if match is not None:
 
 432                 pd = str(match.groups()[0])
 
 434         fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
 
 438     def add_gaps(self, fields, fieldname):
 
 440         Interposes a list of fields with gap-fields (indexed spaces) and returns the result.

 441         This allows phrase queries which do not overlap the gaps (when slop is 0).
 
 445                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 
 446         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 
 448     def get_master(self, root):
 
 450         Returns the first master tag from an etree.
 
 452         for master in root.iter():
 
 453             if master.tag in self.master_tags:
 
 456     def index_content(self, book, book_fields=[]):
 
 458         Walks the book XML and extracts content from it.
 
 459         Adds parts for each header tag and for each fragment.
 
 461         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 
 462         root = wld.edoc.getroot()
 
 464         master = self.get_master(root)
 
 468         def walker(node, ignore_tags=[]):
 
 470             if node.tag not in ignore_tags:
 
 471                 yield node, None, None
 
 472                 if node.text is not None:
 
 473                     yield None, node.text, None
 
 474                 for child in list(node):
 
 475                 for b, t, e in walker(child, ignore_tags=ignore_tags):
 
 477                 yield None, None, node
 
 479             if node.tail is not None:
 
 480                 yield None, node.tail, None
 
 483         def fix_format(text):
 
 484             #            separator = [u" ", u"\t", u".", u";", u","]
 
 485             if isinstance(text, list):
 
 486                 # need to join it first
 
 487                 text = filter(lambda s: s is not None, text)
 
 488                 text = u' '.join(text)
 
 489                 # for i in range(len(text)):
 
 491                 #         if text[i][0] not in separator\
 
 492                 #             and text[i - 1][-1] not in separator:
 
 493                 #          text.insert(i, u" ")
 
 495             return re.sub("(?m)/$", "", text)
 
 497         def add_part(snippets, **fields):
 
 498             doc = self.create_book_doc(book)
 
 499             for f in book_fields:
 
 502             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
 
 503             doc.add(NumericField("header_span", Field.Store.YES, True)\
 
 504                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
 
 505             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
 
 507             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
 
 508                           Field.TermVector.WITH_POSITIONS_OFFSETS))
 
 510             snip_pos = snippets.add(fields["content"])
 
 511             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
 
 512             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
 
 513             if snippets.revision:
 
 514                 doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
 
 516             if 'fragment_anchor' in fields:
 
 517                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
 
 518                               Field.Store.YES, Field.Index.NOT_ANALYZED))
 
 520             if 'themes' in fields:
 
 521                 themes, themes_pl = zip(*[
 
 522                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
 
 523                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
 
 524                      for theme in fields['themes']])
 
 526                 themes = self.add_gaps(themes, 'themes')
 
 527                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
 
 537             if isinstance(s, unicode):
 
 538                 return s.encode('utf-8')
 
 543         snippets = Snippets(book.id).open('w')
 
 545             for header, position in zip(list(master), range(len(master))):
 
 547                 if header.tag in self.skip_header_tags:
 
 549                 if header.tag is etree.Comment:
 
 556                 def all_content(text):
 
 557                     for frag in fragments.values():
 
 558                         frag['content'].append(text)
 
 560                 handle_text = [all_content]
 
 563                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 
 565                     if start is not None and start.tag in self.footnote_tags:
 
 567                         def collect_footnote(t):
 
 569                         handle_text.append(collect_footnote)
 
 570                     elif end is not None and footnote and end.tag in self.footnote_tags:
 
 572                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 
 573                                        content=u''.join(footnote),
 
 574                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
 
 576                         self.index.addDocument(doc)
 
 577                         #print "@ footnote text: %s" % footnote
 
 580                     # handle fragments and themes.
 
 581                     if start is not None and start.tag == 'begin':
 
 582                         fid = start.attrib['id'][1:]
 
 583                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 
 585                     # themes for this fragment
 
 586                     elif start is not None and start.tag == 'motyw':
 
 587                         fid = start.attrib['id'][1:]
 
 588                         handle_text.append(None)
 
 589                         if start.text is not None:
 
 590                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
 
 591                     elif end is not None and end.tag == 'motyw':
 
 594                     elif start is not None and start.tag == 'end':
 
 595                         fid = start.attrib['id'][1:]
 
 596                         if fid not in fragments:
 
 597                             continue  # a broken <end> node, skip it
 
 598                         frag = fragments[fid]
 
 599                         if frag['themes'] == []:
 
 600                             continue  # empty themes list.
 
 603                         doc = add_part(snippets,
 
 604                                        header_type=frag['start_header'],
 
 605                                        header_index=frag['start_section'],
 
 606                                        header_span=position - frag['start_section'] + 1,
 
 608                                        content=fix_format(frag['content']),
 
 609                                        themes=frag['themes'])
 
 610                         #print '@ FRAG %s' % frag['content']
 
 611                         self.index.addDocument(doc)
 
 615                     if text is not None and handle_text:
 
 616                         hdl = handle_text[-1]
 
 620                         # in the end, add a section text.
 
 621                 doc = add_part(snippets, header_index=position, header_type=header.tag,
 
 622                                content=fix_format(content))
 
 623                 #print '@ CONTENT: %s' % fix_format(content)
 
 625                 self.index.addDocument(doc)
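
# Illustrative sketch (not part of the original code): the basic single-book
# (re)indexing flow, as a management command might run it. index_book() removes the
# previous documents itself (overwrite=True); close() optimizes the writer and
# emits index_changed.
def _example_index_one_book(book):
    index = Index()
    index.open()
    try:
        index.index_book(book)
        # index.remove_book(book)              # would delete it (and its snippets) instead
        # index.index_tags(remove_only=True)   # would clear the tag documents
    finally:
        index.close()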
 
 631 def log_exception_wrapper(f):
 
 636             log.error("Error in indexing thread: %s" % e)
 
 637             traceback.print_exc()
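
# Illustrative sketch (not part of the original code): the wrapper above is meant
# for indexing jobs handed to a worker pool (see the commented-out pool code in
# ReusableIndex below), where an uncaught exception would otherwise disappear.
def _example_wrapped_index_book(index, book):
    safe_index_book = log_exception_wrapper(Index.index_book)
    return safe_index_book(index, book)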
 
 642 class ReusableIndex(Index):
 
 644     Works like Index, but does not close/optimize the Lucene index

 645     until program exit (uses an atexit hook).

 646     This is useful for the importbooks command.

 648     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
 
 652     def open(self, analyzer=None, **kw):
 
 653         if ReusableIndex.index:
 
 654             self.index = ReusableIndex.index
 
 656             Index.open(self, analyzer, **kw)
 
 657             ReusableIndex.index = self.index
 
 658             atexit.register(ReusableIndex.close_reusable)
 
 660     # def index_book(self, *args, **kw):
 
 661     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
 
 662     #     ReusableIndex.pool_jobs.append(job)
 
 665     def close_reusable():
 
 666         if ReusableIndex.index:
 
 667             ReusableIndex.index.optimize()
 
 668             ReusableIndex.index.close()
 
 669             ReusableIndex.index = None
 
 671             index_changed.send_robust(None)
 
 674         if ReusableIndex.index:
 
 675             ReusableIndex.index.commit()
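
# Illustrative sketch (not part of the original code): how ReusableIndex is meant
# to be used during a long import - repeated open() calls share one IndexWriter,
# and the atexit hook (or an explicit close_reusable()) finally optimizes and
# closes it.
def _example_bulk_import(books):
    for book in books:
        index = ReusableIndex()
        index.open()            # first call creates the writer, later calls reuse it
        index.index_book(book)
    ReusableIndex.close_reusable()   # optional if you can rely on atexit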
 
 678 class JoinSearch(object):
 
 680     This mixin could be used to handle block join queries.
 
 683     def __init__(self, *args, **kw):
 
 684         super(JoinSearch, self).__init__(*args, **kw)
 
 686     def wrapjoins(self, query, fields=[]):
 
 688         This function modifies the query recursively, so that

 689         contained Term and Phrase queries which match the

 690         provided fields are wrapped in a BlockJoinQuery,

 691         and so delegated to child documents.
 
 693         if BooleanQuery.instance_(query):
 
 694             qs = BooleanQuery.cast_(query)
 
 696                 clause = BooleanClause.cast_(clause)
 
 697                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
 
 701             query.extractTerms(termset)
 
 704                 if t.field() not in fields:
 
 706             return BlockJoinQuery(query, self.parent_filter,
 
 707                                   BlockJoinQuery.ScoreMode.Total)
 
 709     def bsearch(self, query, max_results=50):
 
 710         q = self.query(query)
 
 711         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
 
 713         tops = self.searcher.search(bjq, max_results)
 
 715         for found in tops.scoreDocs:
 
 716             doc = self.searcher.doc(found.doc)
 
 717             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 
 718         return (bks, tops.totalHits)
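
# Illustrative sketch (not part of the original code): JoinSearch is a mixin and
# needs a class that also provides query(), searcher and parent_filter - Search
# (defined below) has all three. The combined class here is hypothetical; the
# mixin is not wired up anywhere in this module itself.
def _example_block_join_search(query_string, max_results=50):
    class JoinedSearch(JoinSearch, Search):
        pass
    searcher = JoinedSearch()
    return searcher.bsearch(query_string, max_results=max_results)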
 
 721 class SearchResult(object):
 
 722     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
 
 723         if tokens_cache is None: tokens_cache = {}
 
 728             self._score = scoreDocs.score
 
 733         self._processed_hits = None  # processed hits
 
 735         stored = search.searcher.doc(scoreDocs.doc)
 
 736         self.book_id = int(stored.get("book_id"))
 
 738         pd = stored.get("published_date")
 
 740             self.published_date = int(pd)
 
 742             self.published_date = 0
 
 744         header_type = stored.get("header_type")
 
 745         # we have a content hit in some header or fragment
 
 746         if header_type is not None:
 
 747             sec = (header_type, int(stored.get("header_index")))
 
 748             header_span = stored.get('header_span')
 
 749             header_span = header_span is not None and int(header_span) or 1
 
 751             fragment = stored.get("fragment_anchor")
 
 754                 snippets = snippets.replace("/\n", "\n")
 
 755             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
 
 757             self._hits.append(hit)
 
 760         self.searched = searched
 
 761         self.tokens_cache = tokens_cache
 
 765         return self._score * self.boost
 
 767     def merge(self, other):
 
 768         if self.book_id != other.book_id:
 
 769             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
 
 770         self._hits += other._hits
 
 771         if other.score > self.score:
 
 772             self._score = other._score
 
 776         if hasattr(self, '_book'):
 
 778         return catalogue.models.Book.objects.get(id=self.book_id)
 
 780     book = property(get_book)
 
 784         if self._processed_hits is not None:
 
 785             return self._processed_hits
 
 794         # to sections and fragments
 
 795         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
 
 797         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
 
 799         # sections not covered by fragments
 
 800         sect = filter(lambda s: 0 == len(filter(
 
 801             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
 
 802             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
 
 807         def remove_duplicates(lst, keyfn, compare):
 
 812                     if compare(els[eif], e) >= 1:
 
 817         # remove fragments with duplicated fid's and duplicated snippets
 
 818         frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
 
 819         frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
 
 820                                   lambda a, b: cmp(a[SCORE], b[SCORE]))
 
 822         # remove duplicate sections
 
 826             si = s[POSITION][POSITION_INDEX]
 
 829                 if sections[si]['score'] >= s[SCORE]:
 
 832             m = {'score': s[SCORE],
 
 833                  'section_number': s[POSITION][POSITION_INDEX] + 1,
 
 838         hits = sections.values()
 
 842                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
 
 843             except catalogue.models.Fragment.DoesNotExist:
 
 847             # Figure out if we were searching for a token matching some word in theme name.
 
 848             themes = frag.tags.filter(category='theme')
 
 850             if self.searched is not None:
 
 851                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
 
 853                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
 
 856                             if theme not in themes_hit:
 
 857                                 themes_hit.append(theme)
 
 860             m = {'score': f[SCORE],
 
 862                  'section_number': f[POSITION][POSITION_INDEX] + 1,
 
 864                  'themes_hit': themes_hit
 
 869         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 
 871         self._processed_hits = hits
 
 875     def __unicode__(self):
 
 876         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
 
 879     def aggregate(*result_lists):
 
 881         for rl in result_lists:
 
 883                 if r.book_id in books:
 
 884                     books[r.book_id].merge(r)
 
 887         return books.values()
 
 889     def __cmp__(self, other):
 
 890         c = cmp(self.score, other.score)
 
 892             # this is inverted, because earlier date is better
 
 893             return cmp(other.published_date, self.published_date)
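
# Illustrative sketch (not part of the original code): results coming from several
# partial searches are merged per book with aggregate() and then ordered via the
# __cmp__ above (better score first, earlier publication date breaking ties).
def _example_merge_results(*result_lists):
    results = SearchResult.aggregate(*result_lists)   # one SearchResult per book_id
    return sorted(results, reverse=True)              # uses SearchResult.__cmp__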
 
 900     Given some hint information (things we already know about

 901     our search target - like author, title (specific book), epoch, genre, kind)

 902     we can narrow down the search using filters.
 
 904     def __init__(self, search):
 
 906         Accepts a Searcher instance.
 
 913     def books(self, *books):
 
 915         Give a hint that we search these books.
 
 919     def tags(self, tags):
 
 921         Give a hint that these Tag objects (a list of)
 
 925             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
 
 926                 lst = self.book_tags.get(t.category, [])
 
 928                 self.book_tags[t.category] = lst
 
 929             if t.category in ['theme', 'theme_pl']:
 
 930                 self.part_tags.append(t)
 
 932     def tag_filter(self, tags, field='tags'):
 
 934         Given a list of tags and an optional field (but they are normally in the tags field)
 
 935         returns a filter accepting only books with specific tags.
 
 940             toks = self.search.get_tokens(tag.name, field=field)
 
 941             tag_phrase = PhraseQuery()
 
 943                 tag_phrase.add(Term(field, tok))
 
 944             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
 
 946         return QueryWrapperFilter(q)
 
 948     def book_filter(self):
 
 950         Filters using book tags (all tag kinds except a theme)
 
 952         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
 
 954             return self.tag_filter(tags)
 
 958     def part_filter(self):
 
 960         This filter can be used to look for book parts.
 
 961         It filters on book id and/or themes.
 
 965             fs.append(self.tag_filter(self.part_tags, field='themes'))
 
 967         if self._books != []:
 
 969             for b in self._books:
 
 970                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
 
 971                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
 
 974         return Search.chain_filters(fs)
 
 976     def should_search_for_book(self):
 
 977         return self._books == []
 
 979     def just_search_in(self, all):
 
 980         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
 
 983             if field == 'authors' and 'author' in self.book_tags:
 
 985             if field == 'title' and self._books != []:
 
 987             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
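
# Illustrative sketch (not part of the original code): feeding a Hint into the
# high-level searches below. The tag objects would normally come from the request;
# here they are whatever the caller has already resolved.
def _example_hinted_search(query_string, tag_objects):
    search = Search()
    hint = Hint(search)
    hint.tags(tag_objects)     # author/epoch/genre/kind narrow books, themes narrow parts
    results = search.search_perfect_book(query_string, hint=hint)
    results += search.search_perfect_parts(query_string, hint=hint)
    return SearchResult.aggregate(results)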
 
 993 class Search(IndexStore):
 
 997     def __init__(self, default_field="content"):
 
 998         IndexStore.__init__(self)
 
 999         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
 
 
1001         reader = IndexReader.open(self.store, True)
 
1002         self.searcher = IndexSearcher(reader)
 
1003         self.parser = QueryParser(Version.LUCENE_34, default_field,
 
1006         self.parent_filter = TermsFilter()
 
1007         self.parent_filter.addTerm(Term("is_book", "true"))
 
1008         index_changed.connect(self.reopen)
 
1011         reader = self.searcher.getIndexReader()
 
1012         self.searcher.close()
 
1014         super(Search, self).close()
 
1015         index_changed.disconnect(self.reopen)
 
1017     def reopen(self, **unused):
 
1018         reader = self.searcher.getIndexReader()
 
1019         rdr = reader.reopen()
 
1020         if not rdr.equals(reader):
 
1021             log.debug('Reopening index')
 
1022             oldsearch = self.searcher
 
1023             self.searcher = IndexSearcher(rdr)
 
1027     def query(self, query):
 
1028         """Parse query in default Lucene Syntax. (for humans)
 
1030         return self.parser.parse(query)
 
1032     def simple_search(self, query, max_results=50):
 
1033         """Runs a query for books using lucene syntax. (for humans)
 
1034         Returns (books, total_hits)
 
1037         tops = self.searcher.search(self.query(query), max_results)
 
1039         for found in tops.scoreDocs:
 
1040             doc = self.searcher.doc(found.doc)
 
1041             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 
1042         return (bks, tops.totalHits)
 
1044     def get_tokens(self, searched, field='content', cached=None):
 
1045         """returns tokens analyzed by a proper (for a field) analyzer
 
1046         argument can be: StringReader, string/unicode, or tokens. In the last case
 
1047         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
 
1049         if cached is not None and field in cached:
 
1050             return cached[field]
 
1052         if isinstance(searched, (str, unicode)):
 
1053             searched = StringReader(searched)
 
1054         elif isinstance(searched, list):
 
1058         tokens = self.analyzer.reusableTokenStream(field, searched)
 
1060         while tokens.incrementToken():
 
1061             cta = tokens.getAttribute(CharTermAttribute.class_)
 
1062             toks.append(cta.toString())
 
1064         if cached is not None:
 
1065             cached[field] = toks
 
1070     def fuzziness(fuzzy):
 
1071         """Helper method to sanitize fuzziness"""
 
1074         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
 
1079     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
 
1081         Return a PhraseQuery with a series of tokens.
 
1084             phrase = MultiPhraseQuery()
 
1086                 term = Term(field, t)
 
1087                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
 
1091                     ft = fuzzterm.term()
 
1093                         fuzzterms.append(ft)
 
1094                     if not fuzzterm.next(): break
 
1096                     phrase.add(JArray('object')(fuzzterms, Term))
 
1100             phrase = PhraseQuery()
 
1101             phrase.setSlop(slop)
 
1103                 term = Term(field, t)
 
1108     def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
 
1110         Returns term queries joined by boolean query.
 
1111         modal - applies to boolean query
 
1112         fuzzy - should the query be fuzzy.
 
1116             term = Term(field, t)
 
1118                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
 
1120                 term = TermQuery(term)
 
1121             q.add(BooleanClause(term, modal))
 
1124     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
 
1125                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
 
1126         if filters is None: filters = []
 
1127         if tokens_cache is None: tokens_cache = {}
 
1129         tokens = self.get_tokens(searched, field, cached=tokens_cache)
 
1131         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
 
1133             filters.append(self.term_filter(Term('is_book', 'true')))
 
1134         top = self.searcher.search(query, self.chain_filters(filters), max_results)
 
1136         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
 
1138     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
 
1139                     filters=None, tokens_cache=None, boost=None, snippets=True):
 
1140         if filters is None: filters = []
 
1141         if tokens_cache is None: tokens_cache = {}
 
1144             filters.append(self.term_filter(Term('is_book', 'true')))
 
1146         query = BooleanQuery()
 
1149             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
 
1151             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
 
1152                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 
1154         top = self.searcher.search(query, self.chain_filters(filters), max_results)
 
1156         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
 
1157                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
 
1159     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
 
1161         Search for perfect book matches. Just see if the query matches some author or title,

1162         taking hints into account.
 
1164         fields_to_search = ['authors', 'title']
 
1167             if not hint.should_search_for_book():
 
1169             fields_to_search = hint.just_search_in(fields_to_search)
 
1170             only_in = hint.book_filter()
 
1172         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
 
1176             top = self.searcher.search(q,
 
1177                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
 
1179             for found in top.scoreDocs:
 
1180                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
 
1183     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
 
1184         fields_to_search = ['tags', 'authors', 'title']
 
1188             if not hint.should_search_for_book():
 
1190             fields_to_search = hint.just_search_in(fields_to_search)
 
1191             only_in = hint.book_filter()
 
1193         tokens = self.get_tokens(searched, field='SIMPLE')
 
1197         for fld in fields_to_search:
 
1198             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
 
1199                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 
1202         top = self.searcher.search(q,
 
1203                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
 
1205         for found in top.scoreDocs:
 
1206             books.append(SearchResult(self, found, how_found="search_book"))
 
1210     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
 
1212         Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
 
1213         some part/fragment of the book.
 
1215         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
 
1219             flt = hint.part_filter()
 
1223             top = self.searcher.search(q,
 
1224                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
 
1227             for found in top.scoreDocs:
 
1228                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
 
1232     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
 
1234         Tries to use search terms to match different fields of book (or its parts).
 
1235         E.g. one word can be an author's surname, another a part of the title, and the rest

1236         are some words from the third chapter.
 
1238         if tokens_cache is None: tokens_cache = {}
 
1243             only_in = hint.part_filter()
 
1245         # content only query : themes x content
 
1248         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
 
1249         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
 
1251         # only search in themes when we do not already filter by themes
 
1252         if hint is None or hint.just_search_in(['themes']) != []:
 
1253             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
 
1254                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
 
1256         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
 
1257                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 
1259         topDocs = self.searcher.search(q, only_in, max_results)
 
1260         for found in topDocs.scoreDocs:
 
1261             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
 
1263         # query themes/content x author/title/tags
 
1265         in_content = BooleanQuery()
 
1266         in_meta = BooleanQuery()
 
1268         for fld in ['themes_pl', 'content']:
 
1269             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
 
1271         for fld in ['tags', 'authors', 'title']:
 
1272             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
 
1274         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
 
1275         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
 
1277         topDocs = self.searcher.search(q, only_in, max_results)
 
1278         for found in topDocs.scoreDocs:
 
1279             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
 
1283     # def multisearch(self, query, max_results=50):
 
1286     #     - (phrase) OR -> content
 
1289     #     - (keywords)  -> authors
 
1294         # queryreader = StringReader(query)
 
1295         # tokens = self.get_tokens(queryreader)
 
1297         # top_level = BooleanQuery()
 
1298         # Should = BooleanClause.Occur.SHOULD
 
1300         # phrase_level = BooleanQuery()
 
1301         # phrase_level.setBoost(1.3)
 
1303         # p_content = self.make_phrase(tokens, joined=True)
 
1304         # p_title = self.make_phrase(tokens, 'title')
 
1305         # p_author = self.make_phrase(tokens, 'author')
 
1307         # phrase_level.add(BooleanClause(p_content, Should))
 
1308         # phrase_level.add(BooleanClause(p_title, Should))
 
1309         # phrase_level.add(BooleanClause(p_author, Should))
 
1311         # kw_level = BooleanQuery()
 
1313         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
 
1314         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
 
1315         # kw_level.add(j_themes, Should)
 
1316         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
 
1317         # j_con = self.make_term_query(tokens, joined=True)
 
1318         # kw_level.add(j_con, Should)
 
1320         # top_level.add(BooleanClause(phrase_level, Should))
 
1321         # top_level.add(BooleanClause(kw_level, Should))
 
1325     def get_snippets(self, scoreDoc, query, field='content'):
 
1327         Returns a snippet for found scoreDoc.
 
1329         htmlFormatter = SimpleHTMLFormatter()
 
1330         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
 
1332         stored = self.searcher.doc(scoreDoc.doc)
 
1334         position = stored.get('snippets_position')
 
1335         length = stored.get('snippets_length')
 
1336         if position is None or length is None:
 
1338         revision = stored.get('snippets_revision')
 
1339         if revision: revision = int(revision)
 
1342         book_id = int(stored.get('book_id'))
 
1343         snippets = Snippets(book_id, revision=revision)
 
1348             log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 
1353                 text = snippets.get((int(position),
 
1358             tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
 
1359             #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
 
1360             snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
 
1362         except Exception as e:
 
1364             if hasattr(e, 'getJavaException'):
 
1365                 e2 = unicode(e.getJavaException())
 
1366             raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
 
1371     def enum_to_array(enum):
 
1373         Converts a lucene TermEnum to array of Terms, suitable for
 
1382             if not enum.next(): break
 
1385             return JArray('object')(terms, Term)
 
1387     def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
 
1389         Search for Tag objects using query.
 
1392             filt = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
 
1393         tops = self.searcher.search(query, filt, max_results)
 
1396         for found in tops.scoreDocs:
 
1397             doc = self.searcher.doc(found.doc)
 
1398             is_pdcounter = doc.get('is_pdcounter')
 
1399             category = doc.get('tag_category')
 
1401                 if is_pdcounter == 'true':
 
1402                     if category == 'pd_author':
 
1403                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
 
1404                     elif category == 'pd_book':
 
1405                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
 
1406                         tag.category = 'pd_book'  # make it look more like a tag.
 
1408                         print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
 
1410                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
 
1411                     # don't add the pdcounter tag if same tag already exists
 
1412                 if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
 
1414             except catalogue.models.Tag.DoesNotExist: pass
 
1415             except PDCounterAuthor.DoesNotExist: pass
 
1416             except PDCounterBook.DoesNotExist: pass
 
1418         log.debug('search_tags: %s' % tags)
 
1422     def search_books(self, query, filt=None, max_results=10):
 
1424         Searches for Book objects using query
 
1427         tops = self.searcher.search(query, filt, max_results)
 
1428         for found in tops.scoreDocs:
 
1429             doc = self.searcher.doc(found.doc)
 
1431                 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
 
1432             except catalogue.models.Book.DoesNotExist: pass
 
1435     def make_prefix_phrase(self, toks, field):
 
1436         q = MultiPhraseQuery()
 
1437         for i in range(len(toks)):
 
1438             t = Term(field, toks[i])
 
1439             if i == len(toks) - 1:
 
1440                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
 
1450     def term_filter(term, inverse=False):
 
1451         only_term = TermsFilter()
 
1452         only_term.addTerm(term)
 
1455             neg = BooleanFilter()
 
1456             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
 
1461     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
 
1463         Return auto-complete hints for tags
 
1464         using prefix search.
 
1466         toks = self.get_tokens(string, field='SIMPLE')
 
1467         top = BooleanQuery()
 
1469         for field in ['tag_name', 'tag_name_pl']:
 
1471                 q = self.make_prefix_phrase(toks, field)
 
1473                 q = self.make_term_query(toks, field, fuzzy=fuzzy)
 
1474             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
 
1476         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
 
1478         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
 
1480     def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
 
1482         Returns auto-complete hints for book titles

1483         (because we do not index 'pseudo' title-tags).
 
1486         toks = self.get_tokens(string, field='SIMPLE')
 
1489             q = self.make_prefix_phrase(toks, 'title')
 
1491             q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
 
1493         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
 
1496     def chain_filters(filters, op=ChainedFilter.AND):
 
1498         Chains a filter list together
 
1500         filters = filter(lambda x: x is not None, filters)
 
1501         if not filters:
 
1503         chf = ChainedFilter(JArray('object')(filters, Filter), op)
 
1506     def filtered_categories(self, tags):
 
1508         Return a list of tag categories present in the tags list.
 
1512             cats[t.category] = True
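

# Illustrative sketch (not part of the original code): the two typical entry points
# of this module - full-text search across fields and the auto-complete helpers.
# Everything called here is defined above; only the wrapper names are made up.
def _example_full_search(query_string):
    search = Search()
    results = SearchResult.aggregate(
        search.search_everywhere(query_string),
        search.search_book(query_string))
    return sorted(results, reverse=True)


def _example_autocomplete(prefix_string):
    search = Search()
    return (search.hint_tags(prefix_string, max_results=10),
            search.hint_books(prefix_string, max_results=10))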