librarian bump with proper merge
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from multiprocessing.pool import ThreadPool
31 from threading import current_thread
32 import atexit
33 import traceback
34
35
36 class WLAnalyzer(PerFieldAnalyzerWrapper):
37     def __init__(self):
38         polish = PolishAnalyzer(Version.LUCENE_34)
39         #        polish_gap.setPositionIncrementGap(999)
40
41         simple = SimpleAnalyzer(Version.LUCENE_34)
42         #        simple_gap.setPositionIncrementGap(999)
43
44         keyword = KeywordAnalyzer(Version.LUCENE_34)
45
46         # not sure if needed: there's NOT_ANALYZED meaning basically the same
47
48         PerFieldAnalyzerWrapper.__init__(self, polish)
49
50         self.addAnalyzer("tags", simple)
51         self.addAnalyzer("technical_editors", simple)
52         self.addAnalyzer("editors", simple)
53         self.addAnalyzer("url", keyword)
54         self.addAnalyzer("source_url", keyword)
55         self.addAnalyzer("source_name", simple)
56         self.addAnalyzer("publisher", simple)
57         self.addAnalyzer("authors", simple)
58         self.addAnalyzer("title", simple)
59
60         self.addAnalyzer("is_book", keyword)
61         # shouldn't the title have two forms? _pl and simple?
62
63         self.addAnalyzer("themes", simple)
64         self.addAnalyzer("themes_pl", polish)
65
66         self.addAnalyzer("tag_name", simple)
67         self.addAnalyzer("tag_name_pl", polish)
68
69         self.addAnalyzer("translators", simple)
70
71         self.addAnalyzer("KEYWORD", keyword)
72         self.addAnalyzer("SIMPLE", simple)
73         self.addAnalyzer("POLISH", polish)
74
75
76 class IndexStore(object):
77     """
78     Provides access to search index.
79
80     self.store - lucene index directory
81     """
82     def __init__(self):
83         self.make_index_dir()
84         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
85
86     def make_index_dir(self):
87         try:
88             os.makedirs(settings.SEARCH_INDEX)
89         except OSError as exc:
90             if exc.errno == errno.EEXIST:
91                 pass
92             else: raise
93
94
95 class IndexChecker(IndexStore):
96     def __init__(self):
97         IndexStore.__init__(self)
98
99     def check(self):
100         checker = CheckIndex(self.store)
101         status = checker.checkIndex()
102         return status
103
104
105 class Snippets(object):
106     """
107     Manages snippet files for an indexed object (book).
108     The snippets are concatenated together, and their positions and
109     lengths are kept in Lucene index fields.
110     """
111     SNIPPET_DIR = "snippets"
112
113     def __init__(self, book_id):
114         try:
115             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
116         except OSError as exc:
117             if exc.errno == errno.EEXIST:
118                 pass
119             else: raise
120         self.book_id = book_id
121         self.file = None
122
123     def open(self, mode='r'):
124         """
125         Open the snippet file. Call .close() afterwards.
126         """
127         if 'b' not in mode:
128             mode += 'b'
129         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
130         self.position = 0
131         return self
132
133     def add(self, snippet):
134         """
135         Append a snippet (unicode) to the snippet file.
136         Return a (position, length) tuple
137         """
138         txt = snippet.encode('utf-8')
139         l = len(txt)
140         self.file.write(txt)
141         pos = (self.position, l)
142         self.position += l
143         return pos
144
145     def get(self, pos):
146         """
147         Given a (position, length) tuple, return the snippet
148         stored there as unicode.
149         """
150         self.file.seek(pos[0], 0)
151         txt = self.file.read(pos[1]).decode('utf-8')
152         return txt
153
154     def close(self):
155         """Close snippet file"""
156         self.file.close()
157
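# A minimal usage sketch for Snippets (editor's illustration, not part of the
# original module); it assumes settings.SEARCH_INDEX is writable and uses a
# made-up book id:
#
#     snips = Snippets(123).open('w')        # create/truncate snippet file for book 123
#     try:
#         pos = snips.add(u"Ala ma kota")    # returns (position, length) in bytes
#     finally:
#         snips.close()
#     # later, e.g. while building search result snippets:
#     print Snippets(123).open().get(pos)    # -> u"Ala ma kota"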
158
159 class BaseIndex(IndexStore):
160     """
161     Base index class.
162     Provides basic operations on index: opening, closing, optimizing.
163     """
164     def __init__(self, analyzer=None):
165         super(BaseIndex, self).__init__()
166         self.index = None
167         if not analyzer:
168             analyzer = WLAnalyzer()
169         self.analyzer = analyzer
170
171     def open(self, analyzer=None):
172         if self.index:
173             raise Exception("Index is already opened")
174         self.index = IndexWriter(self.store, self.analyzer,\
175                                  IndexWriter.MaxFieldLength.LIMITED)
176         return self.index
177
178     def optimize(self):
179         self.index.optimize()
180
181     def close(self):
182         try:
183             self.index.optimize()
184         except JavaError as je:
185             print "Error during optimize phase, check index: %s" % je
186
187         self.index.close()
188         self.index = None
189
190     def __enter__(self):
191         self.open()
192         return self
193
194     def __exit__(self, type, value, tb):
195         self.close()
196
197
198 class Index(BaseIndex):
199     """
200     Class indexing books.
201     """
202     def __init__(self, analyzer=None):
203         super(Index, self).__init__(analyzer)
204
205     def index_tags(self):
206         """
207         Re-index global tag list.
208         Removes all tags from the index, then indexes them again.
209         Indexed fields include: id, name (with and without Polish stems), category.
210         """
211         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
212         self.index.deleteDocuments(q)
213
214         for tag in catalogue.models.Tag.objects.all():
215             doc = Document()
216             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
217             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
218             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
220             self.index.addDocument(doc)
221
222     def create_book_doc(self, book):
223         """
224         Create a Lucene document referring to a book id.
225         """
226         doc = Document()
227         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
228         if book.parent is not None:
229             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
230         return doc
231
232     def remove_book(self, book):
233         """Removes a book from search index.
234         book - Book instance."""
235         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
236         self.index.deleteDocuments(q)
237
238     def index_book(self, book, book_info=None, overwrite=True):
239         """
240         Indexes the book.
241         Creates a lucene document for extracted metadata
242         and calls self.index_content() to index the contents of the book.
243         """
244         if overwrite:
245             self.remove_book(book)
246
247         book_doc = self.create_book_doc(book)
248         meta_fields = self.extract_metadata(book, book_info)
249         for f in meta_fields.values():
250             if isinstance(f, list) or isinstance(f, tuple):
251                 for elem in f:
252                     book_doc.add(elem)
253             else:
254                 book_doc.add(f)
255
256         self.index.addDocument(book_doc)
257         del book_doc
258
259         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
260
261     master_tags = [
262         'opowiadanie',
263         'powiesc',
264         'dramat_wierszowany_l',
265         'dramat_wierszowany_lp',
266         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
267         'wywiad',
268         ]
269
270     ignore_content_tags = [
271         'uwaga', 'extra',
272         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
273         'didaskalia',
274         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
275         ]
276
277     footnote_tags = ['pa', 'pt', 'pr', 'pe']
278
279     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
280
281     published_date_re = re.compile("([0-9]+)[\]. ]*$")
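    # Illustrative example (editor's note): published_date_re captures the
    # trailing year of the source description, e.g. a hypothetical
    # source_name "Czytelnik, Warszawa 1990" yields "1990".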
282
283     def extract_metadata(self, book, book_info=None):
284         """
285         Extract metadata from the book and return a map of fields keyed by field name.
286         """
287         fields = {}
288
289         if book_info is None:
290             book_info = dcparser.parse(open(book.xml_file.path))
291
292         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
293         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
294         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
295
296         # validator, name
297         for field in dcparser.BookInfo.FIELDS:
298             if hasattr(book_info, field.name):
299                 if not getattr(book_info, field.name):
300                     continue
301                 # since no type information is available, we use validator
302                 type_indicator = field.validator
303                 if type_indicator == dcparser.as_unicode:
304                     s = getattr(book_info, field.name)
305                     if field.multiple:
306                         s = ', '.join(s)
307                     try:
308                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
309                     except JavaError as je:
310                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
311                 elif type_indicator == dcparser.as_person:
312                     p = getattr(book_info, field.name)
313                     if isinstance(p, dcparser.Person):
314                         persons = unicode(p)
315                     else:
316                         persons = ', '.join(map(unicode, p))
317                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
318                 elif type_indicator == dcparser.as_date:
319                     dt = getattr(book_info, field.name)
320                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
321                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
322
323         # get published date
324         source = book_info.source_name
325         match = self.published_date_re.search(source)
326         if match is not None:
327             fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
328
329         return fields
330
331     def add_gaps(self, fields, fieldname):
332         """
333         Interleaves a list of fields with gap fields (indexed spaces) and returns the result.
334         This allows phrase queries that do not cross the gaps (when slop is 0).
335         """
336         def gap():
337             while True:
338                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
339         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
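    # Illustrative example (editor's note): for two tag fields f_a and f_b,
    #
    #     self.add_gaps([f_a, f_b], 'tags')   # -> (f_a, <gap>, f_b)
    #
    # where <gap> is a NOT_ANALYZED field containing a single space, so a
    # phrase query with slop=0 cannot match across the two tag values.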
340
341     def get_master(self, root):
342         """
343         Returns the first master tag from an etree.
344         """
345         for master in root.iter():
346             if master.tag in self.master_tags:
347                 return master
348
349     def index_content(self, book, book_fields=[]):
350         """
351         Walks the book XML and extracts content from it.
352         Adds parts for each header tag and for each fragment.
353         """
354         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
355         root = wld.edoc.getroot()
356
357         master = self.get_master(root)
358         if master is None:
359             return []
360
361         def walker(node, ignore_tags=[]):
362             yield node, None
363             for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
364                 for b, e in walker(child):
365                     yield b, e
366             yield None, node
367             return
368
369         def fix_format(text):
370             #            separator = [u" ", u"\t", u".", u";", u","]
371             if isinstance(text, list):
372                 # need to join it first
373                 text = filter(lambda s: s is not None, text)
374                 text = u' '.join(text)
375                 # for i in range(len(text)):
376                 #     if i > 0:
377                 #         if text[i][0] not in separator\
378                 #             and text[i - 1][-1] not in separator:
379                 #          text.insert(i, u" ")
380
381             return re.sub("(?m)/$", "", text)
382
383         def add_part(snippets, **fields):
384             doc = self.create_book_doc(book)
385             for f in book_fields:
386                 doc.add(f)
387
388             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
389             doc.add(NumericField("header_span", Field.Store.YES, True)\
390                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
391             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
392
393             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
394                           Field.TermVector.WITH_POSITIONS_OFFSETS))
395
396             snip_pos = snippets.add(fields["content"])
397             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
398             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
399
400             if 'fragment_anchor' in fields:
401                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
402                               Field.Store.YES, Field.Index.NOT_ANALYZED))
403
404             if 'themes' in fields:
405                 themes, themes_pl = zip(*[
406                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
407                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
408                      for theme in fields['themes']])
409
410                 themes = self.add_gaps(themes, 'themes')
411                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
412
413                 for t in themes:
414                     doc.add(t)
415                 for t in themes_pl:
416                     doc.add(t)
417
418             return doc
419
420         def give_me_utf8(s):
421             if isinstance(s, unicode):
422                 return s.encode('utf-8')
423             else:
424                 return s
425
426         fragments = {}
427         snippets = Snippets(book.id).open('w')
428         try:
429             for position, header in enumerate(master):
430
431                 if header.tag in self.skip_header_tags:
432                     continue
433                 if header.tag is etree.Comment:
434                     continue
435
436                 # section content
437                 content = []
438                 footnote = None
439
440                 for start, end in walker(header, ignore_tags=self.ignore_content_tags):
441                     # handle footnotes
442                     # if start is not None and start.tag in self.footnote_tags:
443                     #     footnote = ' '.join(start.itertext())
444                     # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
445                     #     doc = add_part(snippets, header_index=position, header_type=header.tag,
446                     #                    content=footnote)
447
448                     #     self.index.addDocument(doc)
449
450                     #     footnote = None
451
452                     # handle fragments and themes.
453                     if start is not None and start.tag == 'begin':
454                         fid = start.attrib['id'][1:]
455                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
456
457                     elif start is not None and start.tag == 'motyw':
458                         fid = start.attrib['id'][1:]
459                         if start.text is not None:
460                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
461
462                     elif start is not None and start.tag == 'end':
463                         fid = start.attrib['id'][1:]
464                         if fid not in fragments:
465                             continue  # a broken <end> node, skip it
467                         frag = fragments[fid]
468                         if frag['themes'] == []:
469                             continue  # empty themes list.
470                         del fragments[fid]
471
472                         doc = add_part(snippets,
473                                        header_type=frag['start_header'],
474                                        header_index=frag['start_section'],
475                                        header_span=position - frag['start_section'] + 1,
476                                        fragment_anchor=fid,
477                                        content=fix_format(frag['content']),
478                                        themes=frag['themes'])
479
480                         self.index.addDocument(doc)
481
482                         # Collect content.
483                     elif start is not None:
484                         for frag in fragments.values():
485                             frag['content'].append(start.text)
486                         content.append(start.text)
487                     elif end is not None:
488                         for frag in fragments.values():
489                             frag['content'].append(end.tail)
490                         content.append(end.tail)
491
492                 # In the end, add the section text.
493                 doc = add_part(snippets, header_index=position, header_type=header.tag,
494                                content=fix_format(content))
495
496                 self.index.addDocument(doc)
497
498         finally:
499             snippets.close()
500
501
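# A minimal indexing sketch (editor's illustration, not part of the original
# module). BaseIndex.__enter__/__exit__ make Index usable as a context
# manager; the slug below is hypothetical:
#
#     book = catalogue.models.Book.objects.get(slug='pan-tadeusz')
#     with Index() as idx:
#         idx.index_book(book)   # metadata fields + content parts + snippet file
#         idx.index_tags()       # re-create the global tag documents
#     # __exit__ calls close(), which optimizes and closes the IndexWriter.
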
502 def log_exception_wrapper(f):
503     def _wrap(*a):
504         try:
505             f(*a)
506         except Exception as e:
507             print("Error in indexing thread: %s" % e)
508             traceback.print_exc()
509             raise e
510     return _wrap
511
512
513 class ReusableIndex(Index):
514     """
515     Works like Index, but does not close/optimize the Lucene index
516     until program exit (uses an atexit hook).
517     This is useful for the importbooks command.
518
519     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
520     """
521     index = None
522
523     def open(self, analyzer=None, threads=4):
524         if ReusableIndex.index is not None:
525             self.index = ReusableIndex.index
526         else:
527             print("opening index")
528             Index.open(self, analyzer)
529             ReusableIndex.index = self.index
530             atexit.register(ReusableIndex.close_reusable)
531
532     # def index_book(self, *args, **kw):
533     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
534     #     ReusableIndex.pool_jobs.append(job)
535
536     @staticmethod
537     def close_reusable():
538         if ReusableIndex.index is not None:
539             ReusableIndex.index.optimize()
540             ReusableIndex.index.close()
541             ReusableIndex.index = None
542
543     def close(self):
544         pass
545
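# Editor's note (illustrative): ReusableIndex shares one IndexWriter for the
# whole process, so repeated indexing calls stay cheap; 'some_book' is a
# placeholder Book instance:
#
#     idx = ReusableIndex()
#     idx.open()                # first call opens the writer and registers the atexit hook
#     idx.index_book(some_book)
#     idx.close()               # a no-op; the real close/optimize happens at process exit,
#                               # or call ReusableIndex.close_reusable() explicitly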
546
547 class JoinSearch(object):
548     """
549     This mixin could be used to handle block join queries.
550     (currently unused)
551     """
552     def __init__(self, *args, **kw):
553         super(JoinSearch, self).__init__(*args, **kw)
554
555     def wrapjoins(self, query, fields=[]):
556         """
557         This function modifies the query recursively, so that
558         contained Term and Phrase queries which match the
559         provided fields are wrapped in a BlockJoinQuery,
560         and thus delegated to child documents.
561         """
562         if BooleanQuery.instance_(query):
563             qs = BooleanQuery.cast_(query)
564             for clause in qs:
565                 clause = BooleanClause.cast_(clause)
566                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
567             return qs
568         else:
569             termset = HashSet()
570             query.extractTerms(termset)
571             for t in termset:
572                 t = Term.cast_(t)
573                 if t.field() not in fields:
574                     return query
575             return BlockJoinQuery(query, self.parent_filter,
576                                   BlockJoinQuery.ScoreMode.Total)
577
578     def bsearch(self, query, max_results=50):
579         q = self.query(query)
580         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
581
582         tops = self.searcher.search(bjq, max_results)
583         bks = []
584         for found in tops.scoreDocs:
585             doc = self.searcher.doc(found.doc)
586             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
587         return (bks, tops.totalHits)
588
589
590 class SearchResult(object):
591     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
592         if tokens_cache is None: tokens_cache = {}
593
594         if score:
595             self._score = score
596         else:
597             self._score = scoreDocs.score
598
599         self.boost = 1.0
600
601         self._hits = []
602         self._processed_hits = None  # processed hits
603
604         stored = search.searcher.doc(scoreDocs.doc)
605         self.book_id = int(stored.get("book_id"))
606
607         header_type = stored.get("header_type")
608         if not header_type:
609             return
610
611         sec = (header_type, int(stored.get("header_index")))
612         header_span = stored.get('header_span')
613         header_span = header_span is not None and int(header_span) or 1
614
615         fragment = stored.get("fragment_anchor")
616
617         pd = stored.get("published_date")
618         if pd is None:
619             pd = 0
620         self.published_date = int(pd)
621
622         if snippets:
623             snippets = snippets.replace("/\n", "\n")
624         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
625
626         self._hits.append(hit)
627
628         self.search = search
629         self.searched = searched
630         self.tokens_cache = tokens_cache
631
632     @property
633     def score(self):
634         return self._score * self.boost
635
636     def merge(self, other):
637         if self.book_id != other.book_id:
638             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
639         self._hits += other._hits
640         if other.score > self.score:
641             self._score = other._score
642         return self
643
644     def get_book(self):
645         return catalogue.models.Book.objects.get(id=self.book_id)
646
647     book = property(get_book)
648
649     @property
650     def hits(self):
651         if self._processed_hits is not None:
652             return self._processed_hits
653
654         POSITION = 0
655         FRAGMENT = 1
656         POSITION_INDEX = 1
657         POSITION_SPAN = 2
658         SCORE = 2
659         OTHER = 3
660
661         # to sections and fragments
662         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
663         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
664         sect = filter(lambda s: 0 == len(filter(
665             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
666             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
667             frags)), sect)
668
669         hits = []
670
671         # remove duplicate fragments
672         fragments = {}
673         for f in frags:
674             fid = f[FRAGMENT]
675             if fid in fragments:
676                 if fragments[fid][SCORE] >= f[SCORE]:
677                     continue
678             fragments[fid] = f
679         frags = fragments.values()
680
681         # remove duplicate sections
682         sections = {}
683
684         for s in sect:
685             si = s[POSITION][POSITION_INDEX]
686             # skip existing
687             if si in sections:
688                 if sections[si]['score'] >= s[SCORE]:
689                     continue
690
691             m = {'score': s[SCORE],
692                  'section_number': s[POSITION][POSITION_INDEX] + 1,
693                  }
694             m.update(s[OTHER])
695             sections[si] = m
696
697         hits = sections.values()
698
699         for f in frags:
700             try:
701                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
702             except catalogue.models.Fragment.DoesNotExist:
703                 # stale index
704                 continue
705
706             # Figure out if we were searching for a token matching some word in theme name.
707             themes = frag.tags.filter(category='theme')
708             themes_hit = []
709             if self.searched is not None:
710                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
711                 for theme in themes:
712                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
713                     for t in tokens:
714                         if t in name_tokens:
715                             if theme not in themes_hit:
716                                 themes_hit.append(theme)
717                             break
718
719             m = {'score': f[SCORE],
720                  'fragment': frag,
721                  'section_number': f[POSITION][POSITION_INDEX] + 1,
722                  'themes': themes,
723                  'themes_hit': themes_hit
724                  }
725             m.update(f[OTHER])
726             hits.append(m)
727
728         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
729
730         self._processed_hits = hits
731
732         return hits
733
734     def __unicode__(self):
735         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
736
737     @staticmethod
738     def aggregate(*result_lists):
739         books = {}
740         for rl in result_lists:
741             for r in rl:
742                 if r.book_id in books:
743                     books[r.book_id].merge(r)
744                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
745                 else:
746                     books[r.book_id] = r
747         return books.values()
748
749     def __cmp__(self, other):
750         c = cmp(self.score, other.score)
751         if c == 0:
752             # this is inverted, because earlier date is better
753             return cmp(other.published_date, self.published_date)
754         else:
755             return c
756
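# Editor's note (illustrative): results for the same book can be merged, and
# the 'hits' property then drops sections that fall inside a returned fragment
# and keeps only the best-scoring duplicates. Both variables below are
# placeholders for SearchResult instances:
#
#     r = result_from_title.merge(result_from_content)   # requires equal book_id
#     r.boost = 1.2                                       # scales r.score
#     for hit in r.hits:        # dicts with 'score', 'section_number', optional 'fragment', ...
#         print hit['score'], hit.get('fragment')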
757
758 class Hint(object):
759     """
760     Given some hint information (things we already know about
761     our search target, like author, title (specific book), epoch, genre, kind)
762     we can narrow down the search using filters.
763     """
764     def __init__(self, search):
765         """
766         Accepts a Search instance.
767         """
768         self.search = search
769         self.book_tags = {}
770         self.part_tags = []
771         self._books = []
772
773     def books(self, *books):
774         """
775         Give a hint that we search these books.
776         """
777         self._books = books
778
779     def tags(self, tags):
780         """
781         Give a hint that these Tag objects (a list)
782         are necessary.
783         """
784         for t in tags:
785             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
786                 lst = self.book_tags.get(t.category, [])
787                 lst.append(t)
788                 self.book_tags[t.category] = lst
789             if t.category in ['theme', 'theme_pl']:
790                 self.part_tags.append(t)
791
792     def tag_filter(self, tags, field='tags'):
793         """
794         Given a list of tags and an optional field (they are normally in the 'tags' field),
795         returns a filter accepting only books with those specific tags.
796         """
797         q = BooleanQuery()
798
799         for tag in tags:
800             toks = self.search.get_tokens(tag.name, field=field)
801             tag_phrase = PhraseQuery()
802             for tok in toks:
803                 tag_phrase.add(Term(field, tok))
804             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
805
806         return QueryWrapperFilter(q)
807
808     def book_filter(self):
809         """
810         Filters using book tags (all tag categories except theme).
811         """
812         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
813         if tags:
814             return self.tag_filter(tags)
815         else:
816             return None
817
818     def part_filter(self):
819         """
820         This filter can be used to look for book parts.
821         It filters on book id and/or themes.
822         """
823         fs = []
824         if self.part_tags:
825             fs.append(self.tag_filter(self.part_tags, field='themes'))
826
827         if self._books != []:
828             bf = BooleanFilter()
829             for b in self._books:
830                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
831                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
832             fs.append(bf)
833
834         return Search.chain_filters(fs)
835
836     def should_search_for_book(self):
837         return self._books == []
838
839     def just_search_in(self, all):
840         """Holds logic to figure out which indexes should be search, when we have some hinst already"""
841         some = []
842         for field in all:
843             if field == 'authors' and 'author' in self.book_tags:
844                 continue
845             if field == 'title' and self._books != []:
846                 continue
847             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
848                 continue
849             some.append(field)
850         return some
851
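# Editor's note (illustrative): Hint narrows a search using what is already
# known. A hypothetical flow, assuming 'search' is a Search instance and
# 'author_tag' is a Tag with category 'author':
#
#     hint = search.hint()                  # same as Hint(search)
#     hint.tags([author_tag])               # remember the author constraint
#     only_these = hint.book_filter()       # Lucene filter built from the tag
#     fields = hint.just_search_in(['authors', 'title'])   # -> ['title']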
852
853 class Search(IndexStore):
854     """
855     Search facilities.
856     """
857     def __init__(self, default_field="content"):
858         IndexStore.__init__(self)
859         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
860         # self.analyzer = WLAnalyzer()
861         self.searcher = IndexSearcher(self.store, True)
862         self.parser = QueryParser(Version.LUCENE_34, default_field,
863                                   self.analyzer)
864
865         self.parent_filter = TermsFilter()
866         self.parent_filter.addTerm(Term("is_book", "true"))
867
868     def query(self, query):
869         """Parse query in default Lucene Syntax. (for humans)
870         """
871         return self.parser.parse(query)
872
873     def simple_search(self, query, max_results=50):
874         """Runs a query for books using lucene syntax. (for humans)
875         Returns (books, total_hits)
876         """
877
878         tops = self.searcher.search(self.query(query), max_results)
879         bks = []
880         for found in tops.scoreDocs:
881             doc = self.searcher.doc(found.doc)
882             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
883         return (bks, tops.totalHits)
884
885     def get_tokens(self, searched, field='content', cached=None):
886         """returns tokens analyzed by a proper (for a field) analyzer
887         argument can be: StringReader, string/unicode, or tokens. In the last case
888         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
889         """
890         if cached is not None and field in cached:
891             return cached[field]
892
893         if isinstance(searched, str) or isinstance(searched, unicode):
894             searched = StringReader(searched)
895         elif isinstance(searched, list):
896             return searched
897
898         searched.reset()
899         tokens = self.analyzer.reusableTokenStream(field, searched)
900         toks = []
901         while tokens.incrementToken():
902             cta = tokens.getAttribute(CharTermAttribute.class_)
903             toks.append(cta.toString())
904
905         if cached is not None:
906             cached[field] = toks
907
908         return toks
909
910     def fuzziness(self, fuzzy):
911         """Helper method to sanitize fuzziness"""
912         if not fuzzy:
913             return None
914         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
915             return fuzzy
916         else:
917             return 0.5
918
919     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
920         """
921         Return a PhraseQuery with a series of tokens.
922         """
923         if fuzzy:
924             phrase = MultiPhraseQuery()
925             for t in tokens:
926                 term = Term(field, t)
927                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
928                 fuzzterms = []
929
930                 while True:
931                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
932                     ft = fuzzterm.term()
933                     if ft:
934                         fuzzterms.append(ft)
935                     if not fuzzterm.next(): break
936                 if fuzzterms:
937                     phrase.add(JArray('object')(fuzzterms, Term))
938                 else:
939                     phrase.add(term)
940         else:
941             phrase = PhraseQuery()
942             phrase.setSlop(slop)
943             for t in tokens:
944                 term = Term(field, t)
945                 phrase.add(term)
946         return phrase
947
948     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
949         """
950         Returns term queries joined by a boolean query.
951         modal - applies to the boolean query
952         fuzzy - should the query be fuzzy.
953         """
954         q = BooleanQuery()
955         for t in tokens:
956             term = Term(field, t)
957             if fuzzy:
958                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
959             else:
960                 term = TermQuery(term)
961             q.add(BooleanClause(term, modal))
962         return q
963
964     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
965                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
966         if filters is None: filters = []
967         if tokens_cache is None: tokens_cache = {}
968
969         tokens = self.get_tokens(searched, field, cached=tokens_cache)
970
971         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
972         if book:
973             filters.append(self.term_filter(Term('is_book', 'true')))
974         top = self.searcher.search(query, self.chain_filters(filters), max_results)
975
976         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
977
978     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
979                     filters=None, tokens_cache=None, boost=None, snippets=True):
980         if filters is None: filters = []
981         if tokens_cache is None: tokens_cache = {}
982
983         if book:
984             filters.append(self.term_filter(Term('is_book', 'true')))
985
986         query = BooleanQuery()
987
988         for fld in fields:
989             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
990
991             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
992                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
993
994         top = self.searcher.search(query, self.chain_filters(filters), max_results)
995
996         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
997                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
998
999     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1000         """
1001         Search for perfect book matches. Just see if the query matches with some author or title,
1002         taking hints into account.
1003         """
1004         fields_to_search = ['authors', 'title']
1005         only_in = None
1006         if hint:
1007             if not hint.should_search_for_book():
1008                 return []
1009             fields_to_search = hint.just_search_in(fields_to_search)
1010             only_in = hint.book_filter()
1011
1012         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1013
1014         books = []
1015         for q in qrys:
1016             top = self.searcher.search(q,
1017                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1018                 max_results)
1019             for found in top.scoreDocs:
1020                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1021         return books
1022
1023     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1024         fields_to_search = ['tags', 'authors', 'title']
1025
1026         only_in = None
1027         if hint:
1028             if not hint.should_search_for_book():
1029                 return []
1030             fields_to_search = hint.just_search_in(fields_to_search)
1031             only_in = hint.book_filter()
1032
1033         tokens = self.get_tokens(searched, field='SIMPLE')
1034
1035         q = BooleanQuery()
1036
1037         for fld in fields_to_search:
1038             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1039                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1040
1041         books = []
1042         top = self.searcher.search(q,
1043                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1044             max_results)
1045         for found in top.scoreDocs:
1046             books.append(SearchResult(self, found, how_found="search_book"))
1047
1048         return books
1049
1050     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1051         """
1052         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1053         some part/fragment of the book.
1054         """
1055         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1056
1057         flt = None
1058         if hint:
1059             flt = hint.part_filter()
1060
1061         books = []
1062         for q in qrys:
1063             top = self.searcher.search(q,
1064                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1065                                                            flt]),
1066                                        max_results)
1067             for found in top.scoreDocs:
1068                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1069
1070         return books
1071
1072     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1073         """
1074         Tries to use search terms to match different fields of the book (or its parts).
1075         E.g. one word can be an author's surname, another a part of the title, and the rest
1076         some words from the third chapter.
1077         """
1078         if tokens_cache is None: tokens_cache = {}
1079         books = []
1080         only_in = None
1081
1082         if hint:
1083             only_in = hint.part_filter()
1084
1085         # content only query : themes x content
1086         q = BooleanQuery()
1087
1088         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1089         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1090
1091         # only search in themes when we do not already filter by themes
1092         if hint is None or hint.just_search_in(['themes']) != []:
1093             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1094                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1095
1096         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1097                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1098
1099         topDocs = self.searcher.search(q, only_in, max_results)
1100         for found in topDocs.scoreDocs:
1101             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1102             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1103
1104         # query themes/content x author/title/tags
1105         q = BooleanQuery()
1106         in_content = BooleanQuery()
1107         in_meta = BooleanQuery()
1108
1109         for fld in ['themes_pl', 'content']:
1110             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1111
1112         for fld in ['tags', 'authors', 'title']:
1113             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1114
1115         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1116         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1117
1118         topDocs = self.searcher.search(q, only_in, max_results)
1119         for found in topDocs.scoreDocs:
1120             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1121             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1122
1123         return books
1124
1125     # def multisearch(self, query, max_results=50):
1126     #     """
1127     #     Search strategy:
1128     #     - (phrase) OR -> content
1129     #                   -> title
1130     #                   -> authors
1131     #     - (keywords)  -> authors
1132     #                   -> motyw
1133     #                   -> tags
1134     #                   -> content
1135     #     """
1136         # queryreader = StringReader(query)
1137         # tokens = self.get_tokens(queryreader)
1138
1139         # top_level = BooleanQuery()
1140         # Should = BooleanClause.Occur.SHOULD
1141
1142         # phrase_level = BooleanQuery()
1143         # phrase_level.setBoost(1.3)
1144
1145         # p_content = self.make_phrase(tokens, joined=True)
1146         # p_title = self.make_phrase(tokens, 'title')
1147         # p_author = self.make_phrase(tokens, 'author')
1148
1149         # phrase_level.add(BooleanClause(p_content, Should))
1150         # phrase_level.add(BooleanClause(p_title, Should))
1151         # phrase_level.add(BooleanClause(p_author, Should))
1152
1153         # kw_level = BooleanQuery()
1154
1155         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1156         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1157         # kw_level.add(j_themes, Should)
1158         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1159         # j_con = self.make_term_query(tokens, joined=True)
1160         # kw_level.add(j_con, Should)
1161
1162         # top_level.add(BooleanClause(phrase_level, Should))
1163         # top_level.add(BooleanClause(kw_level, Should))
1164
1165         # return None
1166
1167     def get_snippets(self, scoreDoc, query, field='content'):
1168         """
1169         Returns a snippet for found scoreDoc.
1170         """
1171         htmlFormatter = SimpleHTMLFormatter()
1172         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1173
1174         stored = self.searcher.doc(scoreDoc.doc)
1175
1176         position = stored.get('snippets_position')
1177         length = stored.get('snippets_length')
1178         if position is None or length is None:
1179             return None
1180         # locate content.
1181         snippets = Snippets(stored.get('book_id')).open()
1182         try:
1183             text = snippets.get((int(position),
1184                                  int(length)))
1185         finally:
1186             snippets.close()
1187
1188         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1189         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1190         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1191
1192         return snip
1193
1194     @staticmethod
1195     def enum_to_array(enum):
1196         """
1197         Converts a lucene TermEnum to array of Terms, suitable for
1198         addition to queries
1199         """
1200         terms = []
1201
1202         while True:
1203             t = enum.term()
1204             if t:
1205                 terms.append(t)
1206             if not enum.next(): break
1207
1208         if terms:
1209             return JArray('object')(terms, Term)
1210
1211     def search_tags(self, query, filter=None, max_results=40):
1212         """
1213         Search for Tag objects using query.
1214         """
1215         tops = self.searcher.search(query, filter, max_results)
1216
1217         tags = []
1218         for found in tops.scoreDocs:
1219             doc = self.searcher.doc(found.doc)
1220             tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1221             tags.append(tag)
1222             print "%s (%d) -> %f" % (tag, tag.id, found.score)
1223
1224         return tags
1225
1226     def search_books(self, query, filter=None, max_results=10):
1227         """
1228         Searches for Book objects using query
1229         """
1230         bks = []
1231         tops = self.searcher.search(query, filter, max_results)
1232         for found in tops.scoreDocs:
1233             doc = self.searcher.doc(found.doc)
1234             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1235         return bks
1236
1237     def create_prefix_phrase(self, toks, field):
1238         q = MultiPhraseQuery()
1239         for i in range(len(toks)):
1240             t = Term(field, toks[i])
1241             if i == len(toks) - 1:
1242                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1243                 if pterms:
1244                     q.add(pterms)
1245                 else:
1246                     q.add(t)
1247             else:
1248                 q.add(t)
1249         return q
1250
1251     @staticmethod
1252     def term_filter(term, inverse=False):
1253         only_term = TermsFilter()
1254         only_term.addTerm(term)
1255
1256         if inverse:
1257             neg = BooleanFilter()
1258             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1259             only_term = neg
1260
1261         return only_term
1262
1263     def hint_tags(self, string, max_results=50):
1264         """
1265         Return auto-complete hints for tags
1266         using prefix search.
1267         """
1268         toks = self.get_tokens(string, field='SIMPLE')
1269         top = BooleanQuery()
1270
1271         for field in ['tag_name', 'tag_name_pl']:
1272             q = self.create_prefix_phrase(toks, field)
1273             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1274
1275         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1276
1277         return self.search_tags(top, no_book_cat, max_results=max_results)
1278
1279     def hint_books(self, string, max_results=50):
1280         """
1281         Returns auto-complete hints for book titles
1282         (because we do not index 'pseudo' title tags).
1283         Uses prefix search.
1284         """
1285         toks = self.get_tokens(string, field='SIMPLE')
1286
1287         q = self.create_prefix_phrase(toks, 'title')
1288
1289         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1290
1291     @staticmethod
1292     def chain_filters(filters, op=ChainedFilter.AND):
1293         """
1294         Chains a filter list together
1295         """
1296         filters = filter(lambda x: x is not None, filters)
1297         if not filters:
1298             return None
1299         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1300         return chf
1301
1302     def filtered_categories(self, tags):
1303         """
1304         Return a list of tag categories, present in tags list.
1305         """
1306         cats = {}
1307         for t in tags:
1308             cats[t.category] = True
1309         return cats.keys()
1310
1311     def hint(self):
1312         return Hint(self)
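
# Editor's note: an end-to-end sketch of how this module is meant to be used
# (illustrative only; the query string is a made-up example):
#
#     search = Search()
#     query = u"Pan Tadeusz"
#     results = SearchResult.aggregate(
#         search.search_perfect_book(query),
#         search.search_perfect_parts(query),
#         search.search_everywhere(query))
#     results.sort(reverse=True)            # best score first, ties broken by earlier date
#     for r in results:
#         print r.book, [h['score'] for h in r.hits]
#
#     # auto-complete helpers:
#     search.hint_tags(u"mick")             # tag-name prefix search
#     search.hint_books(u"pan t")           # title prefix search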