add section numbers in search + fixes
[wolnelektury.git] / apps / search / index.py
# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM; this must happen before any lucene class is used.
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    Snippets are concatenated together; their positions and lengths
    are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()
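

# A minimal usage sketch (illustrative only, not called anywhere in this
# module): write snippets while indexing, read them back by the
# (position, length) tuple when rendering search results.
#
#     snippets = Snippets(book_id).open('w')
#     try:
#         pos = snippets.add(u"some fragment text")
#     finally:
#         snippets.close()
#
#     # later, e.g. in Search.get_snippets():
#     text = Snippets(book_id).open().get(pos)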


class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on the index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,\
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
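

# BaseIndex is a context manager, so subclasses can be used like this
# (a sketch; `book` stands for any catalogue.models.Book instance):
#
#     with Index() as index:
#         index.index_book(book)
#
# __exit__ calls close(), which optimizes and closes the underlying
# IndexWriter even if indexing raised an exception.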


class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")
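    # For example (an illustrative source_name, not taken from real data):
    # u"Czytelnik, Warszawa 1990" matches with the trailing year "1990"
    # as the captured group.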

    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %\
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # get published date
        source = book_info.source_name
        match = self.published_date_re.search(source)
        if match is not None:
            fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes gap-fields (indexed spaces) between the given fields
        and returns the resulting list.
        This allows phrase queries which do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
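
    # For example, add_gaps([f1, f2], 'tags') yields (f1, gap, f2), where
    # gap is an indexed single space; a phrase query with slop 0 then cannot
    # match across the boundary between the two tag values.
    # (f1/f2 are illustrative Field objects for values like "Alpha", "Beta".)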

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            yield node, None
            for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return
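
        # walker() yields a (start, end) event stream in document order:
        # for a tree <a><b/></a> it yields (a, None), (b, None), (None, b),
        # (None, a) - i.e. each node once when entered and once when left.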

        def fix_format(text):
            #            separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #          text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)\
                    .setIntValue(fields.get('header_span') or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(list(master)):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = None

                for start, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    # if start is not None and start.tag in self.footnote_tags:
                    #     footnote = ' '.join(start.itertext())
                    # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
                    #     doc = add_part(snippets, header_index=position, header_type=header.tag,
                    #                    content=footnote)
                    #     self.index.addDocument(doc)
                    #     footnote = None

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)

                    # collect content
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                        content.append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
                        content.append(end.tail)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))

                self.index.addDocument(doc)

        finally:
            snippets.close()


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass
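

# Usage sketch (illustrative): index many books with a single shared writer.
# The writer is closed (and optimized) once, at interpreter exit, or
# explicitly via close_reusable().
#
#     index = ReusableIndex()
#     index.open()
#     for book in catalogue.models.Book.objects.all():
#         index.index_book(book)
#     # optionally, instead of waiting for atexit:
#     ReusableIndex.close_reusable()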


class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query in a recursive way:
        contained Term and Phrase queries which match the provided
        fields are wrapped in a BlockJoinQuery and thereby
        delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None:
            tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = header_span is not None and int(header_span) or 1

        fragment = stored.get("fragment_anchor")

        pd = stored.get("published_date")
        if pd is None:
            print("published_date is none for book %d" % self.book_id)
            pd = 0
        self.published_date = int(pd)

        if snippets:
            snippets = snippets.replace("/\n", "\n")
        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

        self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # indices into the raw hit tuples collected in self._hits
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # split into fragment hits and section hits; drop section hits
        # already covered by some fragment hit
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        hits = []

        # remove duplicate fragments, keeping the best-scored one
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections, keeping the best-scored one
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)
                            break

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c
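

# A consumer-side sketch (illustrative; `search` is a Search instance):
# gather results from several strategies, merge per book, sort best-first.
#
#     results = SearchResult.aggregate(
#         search.search_perfect_book(u"pan tadeusz"),
#         search.search_everywhere(u"pan tadeusz"))
#     results.sort(reverse=True)   # uses __cmp__ above; ties broken by date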


class Hint(object):
    """
    Given some hint information (information we already know about
    our search target) - like author, title (specific book), epoch,
    genre, kind - we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of them)
        are necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the
        'tags' field), returns a filter accepting only books with the
        specified tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched
        when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
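

# Usage sketch (illustrative tag/query values): hints narrow later searches.
#
#     hint = search.hint()                  # i.e. Hint(search)
#     hint.tags(author_and_epoch_tags)      # 'author'/'epoch'/... categories
#     results = search.search_perfect_book(u"ballady", hint=hint)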


class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits).
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: a StringReader, a string/unicode, or tokens. In
        the last case they are just returned (so tokens can be reused if the
        analyzer doesn't change).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks

    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5
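
    # For instance: fuzziness(0.8) -> 0.8, fuzziness(True) -> 0.5 (the
    # default minimal similarity), fuzziness(None) -> None (fuzzy search
    # disabled).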

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase
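
    # A sketch of intended use (field and text are illustrative):
    #
    #     tokens = search.get_tokens(u"lokomotywa", field='title')
    #     query = search.make_phrase(tokens, field='title')
    #     top = search.searcher.search(query, 20)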

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined in a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None:
            filters = []
        if tokens_cache is None:
            tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None:
            filters = []
        if tokens_cache is None:
            tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some
        author or title, taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching
        (with a slop of 2, the default for make_phrase()) some part/fragment
        of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of the book (or
        its parts). E.g. one word can match the author, another can be part
        of the title, and the rest some words from the third chapter.
        """
        if tokens_cache is None:
            tokens_cache = {}
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)

    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD

    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)

    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')

    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))

    #     kw_level = BooleanQuery()

    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)

    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))

    #     return None

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None
        # locate content.
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(position),
                                 int(length)))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        adding to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Search for Tag objects using query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term
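
    # e.g. term_filter(Term("is_book", "true")) restricts a search to
    # book-level documents; with inverse=True it restricts it to part-level
    # documents instead (as search_perfect_parts() does above).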

    def hint_tags(self, string, max_results=50):
        """
        Returns auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Returns auto-complete hints for book titles,
        using prefix search (we do not index 'pseudo' title-tags).
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a list of filters together, dropping None entries.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
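
    # e.g. chain_filters([None, flt]) -> a ChainedFilter wrapping just flt;
    # chain_filters([]) -> None. Callers can therefore pass optional filters
    # unconditionally, as search_perfect_book() does above.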

    def filtered_categories(self, tags):
        """
        Returns a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)