# apps/search/index.py (wolnelektury.git)
# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM before any lucene class is used.
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

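
# Illustrative sketch (editor's addition, not used by the app): a round trip
# through the Snippets file as indexing and retrieval use it. The book id
# 9999 is hypothetical; the path comes from settings.SEARCH_INDEX as above.
def _example_snippets_roundtrip():
    snippets = Snippets(9999).open('w')
    try:
        pos = snippets.add(u"Litwo! Ojczyzno moja!")
    finally:
        snippets.close()
    snippets = Snippets(9999).open()
    try:
        return snippets.get(pos)  # -> u"Litwo! Ojczyzno moja!"
    finally:
        snippets.close()
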

class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on the index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already open")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

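
# Illustrative sketch (editor's addition): BaseIndex subclasses are context
# managers, so a typical write session looks like this. `Index` is defined
# below; `book` is a hypothetical catalogue.models.Book instance.
def _example_base_index_usage(book):
    with Index() as index:
        index.index_tags()
        index.index_book(book)
    # on __exit__ the index is optimized and closed
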

class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        # some of these fields may be absent (e.g. no published_date match)
        self.index_content(book, book_fields=[meta_fields[fld]
            for fld in ('title', 'authors', 'published_date') if fld in meta_fields])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # get the published date from the source note
        source = book_info.source_name
        match = self.published_date_re.search(source)
        if match is not None:
            fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields (indexed spaces) and returns the result.
        This allows phrase queries which do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

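    # Illustrative sketch (editor's addition): add_gaps turns [A, B] into
    # (A, <gap>, B), so a slop-0 phrase query cannot match across the
    # boundary between two separate tag values. The tag names are hypothetical.
    def _example_add_gaps(self):
        fields = [Field("tags", t, Field.Store.NO, Field.Index.ANALYZED)
                  for t in [u"romans", u"epika"]]
        return self.add_gaps(fields, "tags")
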
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            #            separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #          text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span') or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'is_footnote' in fields:
                doc.add(fields['is_footnote'])

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['content'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)
                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        handle_text.pop()
                        if footnote:
                            doc = add_part(snippets, header_index=position, header_type=header.tag,
                                           content=u''.join(footnote),
                                           is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
                            self.index.addDocument(doc)
                            print "@ footnote text: %s" % footnote
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])
                        print '@ FRAG %s' % frag['content']
                        self.index.addDocument(doc)

                    # collect content
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))
                print '@ CONTENT: %s' % fix_format(content)

                self.index.addDocument(doc)

        finally:
            snippets.close()

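
# Illustrative sketch (editor's addition): a full (re)indexing pass for one
# book, as a management command might run it. `book` is a hypothetical
# catalogue.models.Book with an xml_file attached.
def _example_index_book(book):
    index = Index()
    index.open()
    try:
        index.index_book(book, overwrite=True)
    finally:
        index.close()  # optimizes and closes the underlying IndexWriter
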

def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass

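
# Illustrative sketch (editor's addition): ReusableIndex keeps one shared
# writer open across instances; when atexit cannot be relied upon (e.g. in a
# long-running worker), close it explicitly. `books` is a hypothetical
# iterable of catalogue.models.Book.
def _example_reusable_index(books):
    idx = ReusableIndex()
    idx.open()
    try:
        for book in books:
            idx.index_book(book)
    finally:
        ReusableIndex.close_reusable()
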

class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        Modifies the query recursively, so that contained Term and Phrase
        queries matching the provided fields are wrapped in a BlockJoinQuery
        and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, search, scoreDoc, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDoc.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDoc.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = int(header_span) if header_span is not None else 1

        fragment = stored.get("fragment_anchor")

        pd = stored.get("published_date")
        if pd is None:
            pd = 0
        self.published_date = int(pd)

        if snippets:
            snippets = snippets.replace("/\n", "\n")
        hit = (sec + (header_span,), fragment, scoreDoc.score, {'how_found': how_found, 'snippets': [snippets] if snippets else []})

        self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # split hits into sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        # drop sections already covered by a fragment hit
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        hits = []

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)
                            break

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

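
# Illustrative sketch (editor's addition): hits from several strategies can
# be merged into one SearchResult per book and ordered best-first, since
# SearchResult defines __cmp__ (an earlier published_date breaks score ties).
# `search` is a Search instance; `query` is a hypothetical query string.
def _example_aggregate_results(search, query):
    results = SearchResult.aggregate(
        search.search_perfect_book(query),
        search.search_everywhere(query))
    return sorted(results, reverse=True)
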

class Hint(object):
    """
    Given some hint information (things we already know about the search
    target, like the author, title (a specific book), epoch, genre or kind)
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search within these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (given as a list)
        are required.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they normally live in
        the 'tags' field), returns a filter accepting only books with those tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return not self._books

    def just_search_in(self, all_fields):
        """Holds the logic to figure out which indexes should be searched
        when we already have some hints."""
        some = []
        for field in all_fields:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books:
                continue
            if field in ('themes', 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some

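
# Illustrative sketch (editor's addition): narrowing a search with a Hint.
# `author_tag` is a hypothetical catalogue.models.Tag with category 'author';
# the query string is made up.
def _example_hint_usage(search, author_tag):
    hint = search.hint()
    hint.tags([author_tag])
    # 'authors' is now excluded from the fields searched:
    return search.search_perfect_book(u"pan tadeusz", hint=hint)
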

class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in default Lucene syntax (for humans)."""
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using Lucene syntax (for humans).
        Returns (books, total_hits).
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

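    # Illustrative sketch (editor's addition): the simplest end-to-end query
    # path through this class. The query string is hypothetical but uses
    # plain Lucene syntax against the indexed fields.
    def _example_simple_search(self):
        books, total_hits = self.simple_search(u'authors:Mickiewicz')
        return books, total_hits
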
    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode, or a token list;
        in the last case the tokens are returned as-is (so tokens can be
        reused when the analyzer does not change).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, basestring):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks

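    # Illustrative sketch (editor's addition): the `cached` dict lets one
    # analysis pass per field be shared by several query builders.
    # `searched` is a hypothetical query string.
    def _example_token_cache(self, searched):
        cache = {}
        self.get_tokens(searched, 'content', cached=cache)  # analyzes and stores
        return self.get_tokens(searched, 'content', cached=cache)  # cache hit
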
    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness parameter."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined in a BooleanQuery.
        modal - occurrence requirement applied to each clause
        fuzzy - whether the term queries should be fuzzy
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

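    # Illustrative sketch (editor's addition): the two low-level query
    # builders above. make_phrase keeps token order (within slop), while
    # make_term_query just joins the tokens with the given occurrence mode.
    # `searched` is a hypothetical query string.
    def _example_query_builders(self, searched):
        toks = self.get_tokens(searched, 'content')
        phrase = self.make_phrase(toks, field='content', slop=2)
        loose = self.make_term_query(toks, field='content',
                                     modal=BooleanClause.Occur.MUST)
        return phrase, loose
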
    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(self.get_snippets(found, query) if snippets else None), searched=searched) for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(self.get_snippets(found, query) if snippets else None)) for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches
        some author or title, taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts containing a phrase that perfectly matches
        (with a slop of 2, the default for make_phrase()) some part/fragment
        of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of a book (or its parts).
        E.g. one word can be an author's surname, another a part of the title,
        and the rest some words from the third chapter.
        """
        if tokens_cache is None: tokens_cache = {}
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        # return None

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None
        # locate content.
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(position),
                                 int(length)))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

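    # Illustrative sketch (editor's addition): highlighting the best match
    # for a hand-written query; get_snippets() needs both the scoreDoc and
    # the query that produced it. `query_text` is hypothetical user input.
    def _example_highlight(self, query_text):
        query = self.query(query_text)
        top = self.searcher.search(query, 10)
        if top.scoreDocs:
            return self.get_snippets(top.scoreDocs[0], query)
        return None
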
    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        adding to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Search for Tag objects using query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print "%s (%d) -> %f" % (tag, tag.id, found.score)

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

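    # Illustrative sketch (editor's addition): term_filter and chain_filters
    # (defined below) combined the way the search_* methods above use them;
    # chain_filters silently drops None entries such as an empty hint filter.
    def _example_filter_chain(self, hint):
        return self.chain_filters([hint.book_filter(),
                                   self.term_filter(Term('is_book', 'true'))])
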
    def hint_tags(self, string, max_results=50):
        """
        Return auto-complete hints for tags
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Returns auto-complete hints for book titles, searched with a prefix
        phrase directly on the 'title' field (we do not index 'pseudo'
        title tags).
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

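    # Illustrative sketch (editor's addition): the two autocomplete entry
    # points; the prefixes are hypothetical user input.
    def _example_autocomplete(self):
        tags = self.hint_tags(u"rom")      # tag names starting with "rom"
        books = self.hint_books(u"pan t")  # titles starting with "pan t"
        return tags, books
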
    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together, dropping None entries.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)