Some search fixes.
[wolnelektury.git] / apps/search/index.py
# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM before any lucene classes are used.
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    """
    Provides access to search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            # The index directory may already exist; anything else is fatal.
            if exc.errno != errno.EEXIST:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            # The snippet directory may already exist; anything else is fatal.
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet
        stored there, as unicode.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()


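# A minimal usage sketch for Snippets (the book id 123 is hypothetical).
# The (position, length) tuple returned by add() is what gets stored in the
# index and handed back to get() at highlighting time:
#
#     snips = Snippets(123).open('w')
#     try:
#         pos = snips.add(u"some snippet text")
#     finally:
#         snips.close()
#
#     snips = Snippets(123).open('r')
#     try:
#         assert snips.get(pos) == u"some snippet text"
#     finally:
#         snips.close()
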
class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()


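# BaseIndex and its subclasses are context managers. A sketch of batch
# indexing (assuming a populated catalogue):
#
#     with Index() as index:
#         index.index_tags()
#         for book in catalogue.models.Book.objects.all():
#             index.index_book(book)
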
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to a book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        # published_date may be absent when no date could be parsed
        # out of source_name in extract_metadata().
        book_fields = [meta_fields[n] for n in ('title', 'authors', 'published_date')
                       if n in meta_fields]
        self.index_content(book, book_fields=book_fields)

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # get published date
        source = book_info.source_name
        match = self.published_date_re.search(source)
        if match is not None:
            fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interpose a list of fields with gap-fields, which are indexed spaces, and return it.
        This allows phrase queries which do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

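    # Illustration (not executed): for two 'tags' values 'romantyzm' and
    # 'epika', add_gaps() returns
    #     [Field(tags, 'romantyzm'), Field(tags, ' '), Field(tags, 'epika')]
    # so a slop-0 phrase query cannot match across the two tag values.
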
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            yield node, None
            for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span', 1)))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        position = 0
        try:
            for header in list(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = None

                for start, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = ' '.join(start.itertext())
                    elif end is not None and footnote is not None and end.tag in self.footnote_tags:
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       content=footnote)

                        self.index.addDocument(doc)

                        footnote = None

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)

                    # collect content
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                        content.append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
                        content.append(end.tail)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))

                self.index.addDocument(doc)
                position += 1

        finally:
            snippets.close()


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass


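# A usage sketch for ReusableIndex (assuming repeated indexing passes in one
# process; import_book is a hypothetical helper):
#
#     for xml_file in files:
#         book = import_book(xml_file)
#         with ReusableIndex() as index:
#             index.index_book(book)
#     # the shared writer is optimized and closed once, at interpreter exit
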
class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query in a recursive way,
        so that contained Term and Phrase queries matching
        the provided fields are wrapped in a BlockJoinQuery,
        and so delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = header_span is not None and int(header_span) or 1

        fragment = stored.get("fragment_anchor")

        pd = stored.get("published_date")
        if pd is None:
            print("published_date is none for book %d" % self.book_id)
            pd = 0
        self.published_date = int(pd)

        if snippets:
            snippets = snippets.replace("/\n", "\n")
        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

        self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            # `score` is a read-only property, so update the underlying value.
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # split hits into sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        # drop sections that are already covered by a fragment hit
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments, keeping the best score for each anchor
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections, keeping the best score for each index
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)
                            break

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c


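# A sketch of the intended flow (assuming a Search instance named `search`):
# run several strategies, merge per-book results, then sort best-first.
#
#     results = SearchResult.aggregate(
#         search.search_perfect_book(u"lalka"),
#         search.search_everywhere(u"lalka"))
#     results.sort(reverse=True)  # uses SearchResult.__cmp__
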
class Hint(object):
    """
    Given some hint information (things we already know about the search
    target, like author, title (a specific book), epoch, genre, kind),
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list) are necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the tags field),
        returns a filter accepting only books with the specified tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched, when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some


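# Hint usage (a sketch; the tag queryset is illustrative):
#
#     search = Search()
#     hint = search.hint()
#     hint.tags(catalogue.models.Tag.objects.filter(category='author'))
#     # 'authors' is now dropped from the searched fields, and results are
#     # narrowed to books carrying those author tags:
#     results = search.search_perfect_book(u"ballady i romanse", hint=hint)
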
class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits).
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

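    # Example (a sketch; assumes an existing index and catalogue):
    #
    #     books, total = Search().simple_search(u'authors:mickiewicz', max_results=10)
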
    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: a StringReader, a string/unicode, or tokens. In the last case
        they are just returned (so we can reuse tokens, if we don't change the analyzer).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks

    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness parameter."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched)
                for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=self.get_snippets(found, query)) for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
        some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        if tokens_cache is None: tokens_cache = {}
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)

    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD

    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)

    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')

    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))

    #     kw_level = BooleanQuery()

    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)

    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))

    #     return None

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None
        # locate content.
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(position),
                                 int(length)))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Search for Tag objects using a query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using a query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                # expand the last token into all terms sharing its prefix
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        """
        Return auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Return auto-complete hints for book titles,
        using prefix search (since we do not index 'pseudo' title-tags).
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a list of filters together, skipping Nones.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

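    # Example (a sketch): combine an optional hint filter with a book filter;
    # Nones are dropped, so this is safe when no hint was given.
    #
    #     flt = Search.chain_filters([only_in, Search.term_filter(Term('is_book', 'true'))])
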
    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)