search fixes
wolnelektury.git: apps/search/index.py
# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM before anything else touches lucene classes.
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode snippet
        stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

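# A minimal Snippets usage sketch (assumes settings.SEARCH_INDEX is
# configured; the book id 123 is hypothetical):
#
#     snips = Snippets(123).open('w')
#     try:
#         pos = snips.add(u"Litwo! Ojczyzno moja!")  # -> (position, length)
#     finally:
#         snips.close()
#
#     snips = Snippets(123).open()
#     try:
#         print snips.get(pos)  # round-trips the same unicode text
#     finally:
#         snips.close()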

class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on the index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

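# BaseIndex subclasses work as context managers; a minimal sketch (Index and
# a catalogue Book instance are assumed to exist):
#
#     with Index() as index:
#         index.index_book(book)  # opened on enter; optimized and closed on
#                                 # exit, even if indexing raises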

class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[
            meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # get published date
        source = book_info.source_name
        match = self.published_date_re.search(source)
        if match is not None:
            fields["published_date"] = Field("published_date", str(match.group(1)), Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
        This allows phrase queries that do not cross the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        # Seed reduce() with an empty tuple so an empty field list does not
        # raise; the final [0:-1] drops the trailing gap.
        return reduce(lambda a, b: a + b, zip(fields, gap()), ())[0:-1]

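    # Schematically, with f(x) standing for a Field (a sketch, not runnable
    # as-is):
    #
    #     add_gaps([f("a"), f("b")], 'tags')  ->  (f("a"), gap, f("b"))
    #
    # so a slop-0 phrase query cannot match across the "a"/"b" boundary.
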
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=None):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        if book_fields is None:
            book_fields = []
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            yield node, None
            for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
                # pass ignore_tags down, so the filter applies at every level
                for b, e in walker(child, ignore_tags):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #          text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span', 1)))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        position = 0
        try:
            for header in list(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = None

                for start, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = ' '.join(start.itertext())
                    elif end is not None and footnote is not None and end.tag in self.footnote_tags:
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       content=footnote)

                        self.index.addDocument(doc)

                        footnote = None

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list, skip it
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)

                    # collect content
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                        content.append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
                        content.append(end.tail)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))

                self.index.addDocument(doc)
                position += 1

        finally:
            snippets.close()


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise  # re-raise without resetting the traceback
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        # The shared index stays open; see close_reusable().
        pass

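# A minimal ReusableIndex lifecycle sketch (books are assumed to come from
# the catalogue app):
#
#     idx = ReusableIndex()
#     idx.open()
#     for book in books:
#         idx.index_book(book)
#     # no explicit close: the atexit hook (or a manual call to
#     # ReusableIndex.close_reusable()) optimizes and closes the shared index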

class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=()):
        """
        This function modifies the query recursively, so that contained
        Term and Phrase queries matching the provided fields are wrapped
        in a BlockJoinQuery and so delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None,
                 snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None:
            tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = int(header_span) if header_span is not None else 1

        fragment = stored.get("fragment_anchor")

        pd = stored.get("published_date")
        if pd is None:
            print "published_date is None for book %d" % self.book_id
            pd = 0
        self.published_date = int(pd)

        if snippets:
            snippets = snippets.replace("/\n", "\n")
        hit = (sec + (header_span,), fragment, scoreDocs.score,
               {'how_found': how_found, 'snippets': [snippets] if snippets else []})

        self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # indices into the raw hit tuples
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # split into fragment hits and section hits
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        # drop sections already covered by some fragment's span
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments, keeping the best score for each anchor
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections, keeping the best score for each index
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)
                            break

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

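    # Shape of the dicts returned by .hits, following the code above (sketch):
    #
    #     {'score': 1.5, 'section_number': 3,           # always present
    #      'how_found': '...', 'snippets': [...],       # merged in from OTHER
    #      'fragment': <Fragment>, 'themes': [...],     # fragment hits only
    #      'themes_hit': [...]}
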
    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c


class Hint(object):
    """
    Given hint information (things we already know about the search target,
    like author, title (a specific book), epoch, genre, kind),
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we are searching these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list)
        are necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (normally the tags field),
        returns a filter accepting only books with the specified tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return not self._books

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched
        when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some

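# A minimal sketch of narrowing a search with Hint (the tag list is assumed
# to come from the catalogue app):
#
#     search = Search()
#     hint = search.hint()
#     hint.tags(tags)  # e.g. an author tag and an epoch tag
#     books = search.search_perfect_book(u"pan tadeusz", hint=hint)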

class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits).
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be a StringReader, a string/unicode, or a token list;
        in the last case the tokens are simply returned (so we can reuse them
        if we don't change the analyzer).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, basestring):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks

    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

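    # Schematic result of make_phrase (a sketch, not tied to a live index):
    #
    #     make_phrase([u'ala', u'kota'], field='content')
    #       -> PhraseQuery  content:"ala kota"~2
    #
    # With fuzzy=True, each token is expanded to all in-index terms within
    # the requested similarity and combined in a MultiPhraseQuery.
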
    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - whether the query should be fuzzy
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None:
            filters = []
        if tokens_cache is None:
            tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found,
                             snippets=(self.get_snippets(found, query) if snippets else None),
                             searched=searched)
                for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None:
            filters = []
        if tokens_cache is None:
            tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(self.get_snippets(found, query) if snippets else None))
                for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches
        some author or title, taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy)
                for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
            self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
            max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching
        (with a slop of 2, the default for make_phrase())
        some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q),
                                          how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of a book (or its parts).
        E.g. one word can match an author, another can be part of the title,
        and the rest may be words from the third chapter.
        """
        if tokens_cache is None:
            tokens_cache = {}
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']):
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)

    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD

    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)

    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')

    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))

    #     kw_level = BooleanQuery()

    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)

    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))

    #     return None

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None
        # locate content.
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(position),
                                 int(length)))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        adding to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Search for Tag objects using query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print "%s (%d) -> %f" % (tag, tag.id, found.score)

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i, tok in enumerate(toks):
            t = Term(field, tok)
            if i == len(toks) - 1:
                # expand the last token to every term sharing its prefix
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        """
        Return auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Returns auto-complete hints for book titles
        (we do not index 'pseudo' title tags),
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together, skipping None entries.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)
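

# A minimal end-to-end sketch (assumes a built index and configured Django
# settings; the query strings are arbitrary examples):
#
#     search = Search()
#     books, total = search.simple_search(u'authors: mickiewicz')
#     results = SearchResult.aggregate(
#         search.search_perfect_book(u'pan tadeusz'),
#         search.search_everywhere(u'pan tadeusz'))
#     for r in sorted(results, reverse=True):
#         print r.book, r.score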