# -*- coding: utf-8 -*-

from django.conf import settings

from lucene import SimpleFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

JVM = initVM(CLASSPATH)
 
import os
import re
import errno
import atexit
import traceback

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor
from multiprocessing.pool import ThreadPool
from threading import current_thread
 
class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED, meaning basically the same
        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
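
# The mapping above means, roughly: plain metadata fields ("tags", "authors",
# "title", ...) go through SimpleAnalyzer, Polish-stemmed fields ("themes_pl",
# "tag_name_pl" and the default) through PolishAnalyzer, and identifier-like
# fields ("url", "source_url", "is_book") through KeywordAnalyzer, which keeps
# the whole value as a single token. The upper-case pseudo-fields ("KEYWORD",
# "SIMPLE", "POLISH") are not real document fields; they let callers such as
# Search.get_tokens() pick an analysis mode by name.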
 
class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
 
class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status
 
class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length), return a unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close snippet file"""
        self.file.close()
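
# A rough usage sketch (illustrative only; book_id 1 and the snippet text are
# made-up values). During indexing, snippets are appended and their offsets are
# stored in the index; during search, the same offsets are used to read them back:
#
#     snips = Snippets(1).open('w')
#     try:
#         pos = snips.add(u"Litwo! Ojczyzno moja!")   # -> (0, 21) on an empty file
#     finally:
#         snips.close()
#
#     snips = Snippets(1).open()
#     try:
#         text = snips.get(pos)                       # -> the same unicode text
#     finally:
#         snips.close()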
 
class BaseIndex(IndexStore):
    """
    Provides basic operations on index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if analyzer is None:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None, timeout=None):
        if self.index:
            raise Exception("Index is already opened")
        if analyzer is None:
            analyzer = self.analyzer
        conf = IndexWriterConfig(Version.LUCENE_34, analyzer)
        if timeout:
            conf.setWriteLockTimeout(long(timeout))
        self.index = IndexWriter(self.store, conf)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je
        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
 
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)
 
    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)
 
    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)
        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
 
    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")
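
    # Illustration (hypothetical source_name value): the regex grabs the trailing
    # year from the bibliographic source note, ignoring closing brackets/dots, e.g.
    #   published_date_re.search(u"Czytelnik, Warszawa 1990].").groups()[0] == u"1990"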
 
    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        pd = ""
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields
 
    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields, which are indexed spaces, and returns the result.
        This allows phrase queries which do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
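
    # In plain-Python terms (illustrative, with strings standing in for Field
    # objects): zip() pairs every field with a gap, reduce() concatenates the
    # pairs into one flat tuple, and the final [0:-1] drops the trailing gap:
    #   reduce(lambda a, b: a + b, zip(['Ala', 'kot'], [' ', ' ']))[0:-1]
    #   == ('Ala', ' ', 'kot')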
 
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
 
    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return
 
        def fix_format(text):
            #            separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #         if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #          text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
 
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)\
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s
 
        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['content'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)
                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       content=u''.join(footnote),
                                       is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

                        self.index.addDocument(doc)
                        #print "@ footnote text: %s" % footnote
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])
                        #print '@ FRAG %s' % frag['content']
                        self.index.addDocument(doc)

                        del fragments[fid]

                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))
                #print '@ CONTENT: %s' % fix_format(content)

                self.index.addDocument(doc)

        finally:
            snippets.close()
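
# What index_content() produces, roughly: for every book there is one metadata
# document (see index_book()), plus one document per master-level header with the
# section text, one document per footnote, and one document per themed fragment
# (with "fragment_anchor" and gap-separated "themes"/"themes_pl" fields). The raw
# text of every such part is appended to the book's Snippets file, and its
# (position, length) is stored in "snippets_position"/"snippets_length".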
 
def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise
    return _wrap
 
class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, **kw):
        if ReusableIndex.index:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer, **kw)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index:
            print("closing index")
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        if ReusableIndex.index:
            ReusableIndex.index.commit()
 
class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query recursively: Term and Phrase queries
        it contains which match the provided fields are wrapped in a
        BlockJoinQuery and so delegated to children documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return query
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
 
class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        if pd is None:
            pd = 0
        self.published_date = int(pd)

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

            if snippets:
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache
 
    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)
 
    def process_hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                continue

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if not theme in themes_hit:
                                themes_hit.append(theme)
                            break

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit,
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits
        return hits
 
    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c
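
    # A rough sketch of how callers are expected to combine results (illustrative;
    # "query" is an arbitrary user string):
    #   results = SearchResult.aggregate(search.search_perfect_book(query),
    #                                    search.search_everywhere(query))
    #   for r in sorted(results, reverse=True):   # uses __cmp__ above
    #       hits = r.process_hits()               # deduplicated sections/fragments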
 
class Hint(object):
    """
    Given some hint information (information we already know about)
    our search target - like author, title (specific book), epoch, genre, kind -
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Search instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of)
        are part of the search target.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the tags field)
        returns a filter accepting only books with the specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme)
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched, when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
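
# A rough usage sketch for Hint (illustrative; "fantastyka" is a made-up tag slug
# and the lookup is hypothetical):
#   hint = Hint(search)
#   hint.tags(catalogue.models.Tag.objects.filter(slug='fantastyka'))
#   search.search_perfect_book(u"lalka", hint=hint)   # narrowed by the tag filter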
 
class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse query in default Lucene Syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits)
        """
        tops = self.searcher.search(self.query(query), max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
 
    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: StringReader, string/unicode, or tokens. In the last case
        they will just be returned (so we can reuse tokens, if we don't change the analyzer).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks
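
    # Note on tokens_cache (illustrative): several search methods below accept a
    # shared dict so the same query string is analyzed at most once per field, e.g.:
    #   cache = {}
    #   toks = search.get_tokens(u"ala ma kota", 'SIMPLE', cached=cache)
    #   search.get_tokens(u"ala ma kota", 'SIMPLE', cached=cache)  # served from cache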
 
    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness"""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5
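
    # Illustrative values for the sanitizer above: fuzziness(False) -> None,
    # fuzziness(0.8) -> 0.8, fuzziness(True) -> 0.5 (the fallback similarity).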
 
    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break

                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase
 
    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()

        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))

        return q
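
    # For example (illustrative), make_term_query([u"ala", u"kot"], field='tags')
    # builds roughly: BooleanQuery( tags:ala SHOULD, tags:kot SHOULD ), while
    # make_phrase() above would require the tokens to appear together as a phrase.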
 
    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
 
    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
 
    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books
 
    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))

        return books
 
    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
        some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books
 
    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        if tokens_cache is None: tokens_cache = {}

        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content only query : themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)

        return books
 
    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords)  -> authors

        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))
 
    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None

        book_id = int(stored.get('book_id'))
        snippets = Snippets(book_id).open()
        try:
            try:
                text = snippets.get((int(position),
                                     int(length)))
            finally:
                snippets.close()

            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
            #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        except Exception as e:
            e2 = e
            if hasattr(e, 'getJavaException'):
                e2 = unicode(e.getJavaException())
            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
                            e2)

        return snip
 
    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        adding to a query.
        """
        terms = []
        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)
 
    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not pdcounter:
            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
            if is_pdcounter:
                tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
            else:
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
                tags.append(tag)
                #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
        print 'returning %s' % tags
        return tags
 
    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks
 
    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q
 
    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term
 
    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
        """
        Return auto-complete hints for tags
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q = self.make_prefix_phrase(toks, field)
            else:
                q = self.make_term_query(toks, field, fuzzy=fuzzy)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)

    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
        """
        Returns auto-complete hints for book titles
        (we do not index 'pseudo' title-tags).
        """
        toks = self.get_tokens(string, field='SIMPLE')

        if prefix:
            q = self.make_prefix_phrase(toks, 'title')
        else:
            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
 
    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
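
    # For instance (illustrative), hint_tags() above ends up combining an inverted
    # "tag_category:book" filter this way (via search_tags), and Hint.part_filter()
    # chains a themes filter with per-book id filters; None entries are simply dropped.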
 
    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()