+
def get_tokens(self, searched, field='content', cached=None):
    """Return the list of token strings produced for `searched`.

    `searched` may be a str/unicode string (it is wrapped in a
    StringReader), a reader-like object exposing reset(), or a list of
    tokens. A list is handed back untouched, so callers can reuse tokens
    they already have.

    `field` is passed to self.analyzer to pick the analysis for that field.
    `cached`, when given, is a dict keyed by field name: an existing entry
    is returned immediately, and freshly analyzed tokens are stored in it.
    """
    if cached is not None and field in cached:
        return cached[field]

    # Ready-made token lists are passed through (and are not cached).
    if isinstance(searched, list):
        return searched

    if isinstance(searched, (str, unicode)):
        searched = StringReader(searched)

    searched.reset()
    stream = self.analyzer.reusableTokenStream(field, searched)
    result = []
    while stream.incrementToken():
        attr = stream.getAttribute(CharTermAttribute.class_)
        result.append(attr.toString())

    if cached is not None:
        cached[field] = result

    return result
+
def fuzziness(self, fuzzy):
    """Normalize a fuzziness setting into a similarity value.

    A falsy argument (False, None, 0, 0.0) means "not fuzzy" and yields
    None. A float in the range (0.0, 1.0] is returned unchanged. Any other
    truthy value (True, an int, an out-of-range float) falls back to 0.5.
    """
    if not fuzzy:
        return None
    usable = isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0
    return fuzzy if usable else 0.5
+
def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
    """Build a phrase query over `tokens` in `field`.

    Without `fuzzy`, a PhraseQuery with the given `slop` is returned, with
    one Term per token added in order.

    With `fuzzy`, a MultiPhraseQuery is returned instead. Each token is
    expanded through a FuzzyTermEnum (similarity from self.fuzziness) and
    all enumerated terms are added as alternatives at that position; when
    the enumeration yields nothing, the plain term is added.
    Note that `slop` is only applied in the non-fuzzy case.
    """
    if not fuzzy:
        exact = PhraseQuery()
        exact.setSlop(slop)
        for tok in tokens:
            exact.add(Term(field, tok))
        return exact

    multi = MultiPhraseQuery()
    for tok in tokens:
        base = Term(field, tok)
        candidates = FuzzyTermEnum(self.searcher.getIndexReader(), base, self.fuzziness(fuzzy))
        variants = []

        # The enum is already positioned on its first term (if any), so
        # read the current term before advancing.
        while True:
            current = candidates.term()
            if current:
                variants.append(current)
            if not candidates.next():
                break

        if variants:
            multi.add(JArray('object')(variants, Term))
        else:
            multi.add(base)
    return multi
+
def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
    """Build a BooleanQuery holding one clause per token.

    tokens - token strings to look for in `field`
    modal  - the occurrence flag given to every clause of the boolean query
    fuzzy  - when truthy, each token becomes a FuzzyQuery whose similarity
             comes from self.fuzziness(fuzzy); otherwise a plain TermQuery
    """
    combined = BooleanQuery()
    for tok in tokens:
        term = Term(field, tok)
        if fuzzy:
            token_query = FuzzyQuery(term, self.fuzziness(fuzzy))
        else:
            token_query = TermQuery(term)
        combined.add(BooleanClause(token_query, modal))
    return combined
+
def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                  filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
    """Search `field` for the phrase in `searched` and return SearchResults.

    searched     - string, reader or token list (see get_tokens)
    field        - indexed field to run the phrase query against
    book         - when true, restrict hits to documents with is_book=true
    max_results  - maximum number of hits requested from the searcher
    fuzzy        - forwarded to make_phrase
    filters      - optional iterable of extra filters; it is copied, never
                   modified in place
    tokens_cache - optional dict used by get_tokens to reuse tokens per field
    boost        - accepted for interface compatibility; not used here
    snippets     - when true, attach snippets from get_snippets to each result
    slop         - forwarded to make_phrase
    """
    # Work on a private copy: appending the is_book filter to the caller's
    # list would leak it into the caller's later searches.
    filters = [] if filters is None else list(filters)
    if tokens_cache is None:
        tokens_cache = {}

    tokens = self.get_tokens(searched, field, cached=tokens_cache)

    query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
    if book:
        filters.append(self.term_filter(Term('is_book', 'true')))
    top = self.searcher.search(query, self.chain_filters(filters), max_results)

    results = []
    for found in top.scoreDocs:
        # Empty snippet results are normalized to None, as they always were.
        snips = (self.get_snippets(found, query) or None) if snippets else None
        results.append(SearchResult(self, found, snippets=snips, searched=searched))
    return results
+
def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                filters=None, tokens_cache=None, boost=None, snippets=True):
    """Search several fields for the tokens of `searched`.

    For every name in `fields` a term query (see make_term_query) is built
    from that field's tokens, and the per-field queries are combined as
    SHOULD clauses of a single BooleanQuery.

    searched     - string, reader or token list (see get_tokens)
    fields       - iterable of field names to search in
    book         - when true, restrict hits to documents with is_book=true
    max_results  - maximum number of hits requested from the searcher
    fuzzy        - forwarded to make_term_query
    filters      - optional iterable of extra filters; it is copied, never
                   modified in place
    tokens_cache - optional dict used by get_tokens; also handed to every
                   SearchResult
    boost        - accepted for interface compatibility; not used here
    snippets     - when true, attach snippets from get_snippets to each result
    """
    # Work on a private copy: appending the is_book filter to the caller's
    # list would leak it into the caller's later searches.
    filters = [] if filters is None else list(filters)
    if tokens_cache is None:
        tokens_cache = {}

    if book:
        filters.append(self.term_filter(Term('is_book', 'true')))

    query = BooleanQuery()

    for fld in fields:
        tokens = self.get_tokens(searched, fld, cached=tokens_cache)
        field_query = self.make_term_query(tokens, field=fld, fuzzy=fuzzy)
        query.add(BooleanClause(field_query, BooleanClause.Occur.SHOULD))

    top = self.searcher.search(query, self.chain_filters(filters), max_results)

    results = []
    for found in top.scoreDocs:
        # Empty snippet results are normalized to None, as they always were.
        snips = (self.get_snippets(found, query) or None) if snippets else None
        results.append(SearchResult(self, found, searched=searched,
                                    tokens_cache=tokens_cache, snippets=snips))
    return results
+
def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
    """Look for books whose authors or title match `searched` as a phrase.

    One phrase query is built per field and each is run separately against
    documents marked is_book=true. Hits of all queries are concatenated,
    so a book matching in both fields appears twice.

    When `hint` is given it may veto the search (an empty list is
    returned), narrow the list of fields, and contribute a book filter.
    """
    candidate_fields = ['authors', 'title']
    restrict = None
    if hint:
        if not hint.should_search_for_book():
            return []
        candidate_fields = hint.just_search_in(candidate_fields)
        restrict = hint.book_filter()

    # Build every query first, then run them one after another.
    queries = []
    for fld in candidate_fields:
        field_tokens = self.get_tokens(searched, field=fld)
        queries.append(self.make_phrase(field_tokens, field=fld, fuzzy=fuzzy))

    matches = []
    for query in queries:
        top = self.searcher.search(
            query,
            self.chain_filters([restrict, self.term_filter(Term('is_book', 'true'))]),
            max_results)
        matches.extend([SearchResult(self, found, how_found="search_perfect_book")
                        for found in top.scoreDocs])
    return matches
+
def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
    """Search books by the tokens of `searched` in tags, authors and title.

    Tokens are produced once with the 'SIMPLE' field analysis and turned
    into one term query per field; the per-field queries are SHOULD clauses
    of a single BooleanQuery run against documents marked is_book=true.

    When `hint` is given it may veto the search (an empty list is
    returned), narrow the list of fields, and contribute a book filter.
    """
    candidate_fields = ['tags', 'authors', 'title']

    restrict = None
    if hint:
        if not hint.should_search_for_book():
            return []
        candidate_fields = hint.just_search_in(candidate_fields)
        restrict = hint.book_filter()

    simple_tokens = self.get_tokens(searched, field='SIMPLE')

    query = BooleanQuery()
    for fld in candidate_fields:
        field_query = self.make_term_query(simple_tokens, field=fld, fuzzy=fuzzy)
        query.add(BooleanClause(field_query, BooleanClause.Occur.SHOULD))

    top = self.searcher.search(
        query,
        self.chain_filters([restrict, self.term_filter(Term('is_book', 'true'))]),
        max_results)

    return [SearchResult(self, found, how_found="search_book")
            for found in top.scoreDocs]
+
def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
    """Find book parts whose content matches `searched` as a phrase.

    The phrase query uses make_phrase's default slop and is run against
    documents that are NOT marked is_book=true, optionally narrowed by
    hint.part_filter(). Every result carries snippets for its hit.
    """
    queries = []
    for fld in ['content']:
        queries.append(self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy))

    part_filter = hint.part_filter() if hint else None

    parts = []
    for query in queries:
        not_a_book = self.term_filter(Term('is_book', 'true'), inverse=True)
        top = self.searcher.search(query,
                                   self.chain_filters([not_a_book, part_filter]),
                                   max_results)
        for found in top.scoreDocs:
            found_snippets = self.get_snippets(found, query)
            parts.append(SearchResult(self, found, snippets=found_snippets,
                                      how_found='search_perfect_parts'))

    return parts
+
def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
    """Match the search terms against several fields at once.

    Two queries are run, both filtered only by hint.part_filter() (when a
    hint is given), and their hits are concatenated:

    1. themes x content: the 'content' tokens SHOULD occur in 'content'
       and, unless the hint rules themes out, MUST occur in 'themes_pl'.
       Results are tagged 'search_everywhere_themesXcontent'.
    2. themes/content x tags/authors/title: the 'content' tokens MUST hit
       'themes_pl' or 'content', and the 'SIMPLE' tokens SHOULD hit one of
       the metadata fields. This query never uses fuzzy matching,
       regardless of `fuzzy`. Results are tagged 'search_everywhere'.
    """
    if tokens_cache is None:
        tokens_cache = {}
    results = []
    part_only = None

    if hint:
        part_only = hint.part_filter()

    # --- query 1: themes x content ---
    themes_content = BooleanQuery()

    content_tokens = self.get_tokens(searched, field='content', cached=tokens_cache)
    simple_tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

    # Themes are only queried when the hint does not already filter by them.
    if hint is None or hint.just_search_in(['themes']) != []:
        in_themes = self.make_term_query(content_tokens, field='themes_pl', fuzzy=fuzzy)
        themes_content.add(BooleanClause(in_themes, BooleanClause.Occur.MUST))

    in_text = self.make_term_query(content_tokens, field='content', fuzzy=fuzzy)
    themes_content.add(BooleanClause(in_text, BooleanClause.Occur.SHOULD))

    first_hits = self.searcher.search(themes_content, part_only, max_results)
    for found in first_hits.scoreDocs:
        results.append(SearchResult(self, found,
                                    how_found='search_everywhere_themesXcontent',
                                    searched=searched))

    # --- query 2: themes/content x author/title/tags ---
    mixed = BooleanQuery()
    content_part = BooleanQuery()
    meta_part = BooleanQuery()

    for fld in ['themes_pl', 'content']:
        sub = self.make_term_query(content_tokens, field=fld, fuzzy=False)
        content_part.add(BooleanClause(sub, BooleanClause.Occur.SHOULD))

    for fld in ['tags', 'authors', 'title']:
        sub = self.make_term_query(simple_tokens, field=fld, fuzzy=False)
        meta_part.add(BooleanClause(sub, BooleanClause.Occur.SHOULD))

    mixed.add(BooleanClause(content_part, BooleanClause.Occur.MUST))
    mixed.add(BooleanClause(meta_part, BooleanClause.Occur.SHOULD))

    second_hits = self.searcher.search(mixed, part_only, max_results)
    for found in second_hits.scoreDocs:
        results.append(SearchResult(self, found, how_found='search_everywhere',
                                    searched=searched))

    return results
+
+ # def multisearch(self, query, max_results=50):
+ # """
+ # Search strategy:
+ # - (phrase) OR -> content
+ # -> title
+ # -> authors
+ # - (keywords) -> authors
+ # -> motyw
+ # -> tags
+ # -> content
+ # """
+ # queryreader = StringReader(query)
+ # tokens = self.get_tokens(queryreader)
+
+ # top_level = BooleanQuery()
+ # Should = BooleanClause.Occur.SHOULD
+
+ # phrase_level = BooleanQuery()
+ # phrase_level.setBoost(1.3)
+
+ # p_content = self.make_phrase(tokens, joined=True)
+ # p_title = self.make_phrase(tokens, 'title')
+ # p_author = self.make_phrase(tokens, 'author')
+
+ # phrase_level.add(BooleanClause(p_content, Should))
+ # phrase_level.add(BooleanClause(p_title, Should))
+ # phrase_level.add(BooleanClause(p_author, Should))
+
+ # kw_level = BooleanQuery()
+
+ # kw_level.add(self.make_term_query(tokens, 'author'), Should)
+ # j_themes = self.make_term_query(tokens, 'themes', joined=True)
+ # kw_level.add(j_themes, Should)
+ # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
+ # j_con = self.make_term_query(tokens, joined=True)
+ # kw_level.add(j_con, Should)
+
+ # top_level.add(BooleanClause(phrase_level, Should))
+ # top_level.add(BooleanClause(kw_level, Should))
+
+ # return None
+
def get_snippets(self, scoreDoc, query, field='content'):
    """Return highlighted snippets for a found scoreDoc.

    The stored document must carry 'snippets_position' and
    'snippets_length'; the text at that position is read from the book's
    Snippets file and the best fragments for `query` are highlighted.

    Returns:
        None when the document has no snippet position/length,
        [] when the snippet file cannot be opened (the error is logged),
        otherwise the result of highlighter.getBestFragments().

    Raises:
        Exception: wrapping any error raised while reading the snippet
        text or highlighting it (Java exceptions are converted to text).
    """
    htmlFormatter = SimpleHTMLFormatter()
    highlighter = Highlighter(htmlFormatter, QueryScorer(query))

    stored = self.searcher.doc(scoreDoc.doc)

    position = stored.get('snippets_position')
    length = stored.get('snippets_length')
    if position is None or length is None:
        return None
    revision = stored.get('snippets_revision')
    if revision:
        revision = int(revision)

    # locate content.
    book_id = int(stored.get('book_id'))
    snippets = Snippets(book_id, revision=revision)

    try:
        snippets.open()
    except IOError as e:
        # revision may be None (no stored revision); formatting it with %d
        # raised TypeError here and hid the real I/O error, hence %s.
        log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
        return []

    try:
        try:
            text = snippets.get((int(position),
                                 int(length)))
        finally:
            # The snippet file is closed whether or not the read succeeded.
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        # highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

    except Exception as e:
        e2 = e
        # Java-side exceptions are turned into text for the error report.
        if hasattr(e, 'getJavaException'):
            e2 = unicode(e.getJavaException())
        raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
                        e2)
    return snip
+
@staticmethod
def enum_to_array(enum):
    """
    Collect the terms of a lucene TermEnum into a Java array of Terms,
    suitable for adding to queries.

    Returns None when the enumeration yields no terms.
    """
    collected = []

    # Read the current term first, then advance; stop once next() reports
    # that the enumeration is exhausted.
    while True:
        current = enum.term()
        if current:
            collected.append(current)
        if not enum.next():
            break

    if not collected:
        return None
    return JArray('object')(collected, Term)
+
def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
    """
    Search for Tag objects using query.

    query       - lucene query to run
    filt        - optional lucene filter applied to the search
    max_results - maximum number of hits requested from the searcher
    pdcounter   - when false, documents marked is_pdcounter=true are
                  filtered out; when true they are resolved to
                  PDCounterAuthor / PDCounterBook objects

    Returns a list of tag-like model objects. Index entries whose database
    row no longer exists are skipped silently.
    """
    filters = filt
    if not pdcounter:
        # Exclude pdcounter entries. This chained filter used to be built
        # and then ignored, because the bare `filt` was passed to search().
        filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
    tops = self.searcher.search(query, filters, max_results)

    tags = []
    for found in tops.scoreDocs:
        doc = self.searcher.doc(found.doc)
        is_pdcounter = doc.get('is_pdcounter')
        category = doc.get('tag_category')
        try:
            if is_pdcounter == 'true':
                if category == 'pd_author':
                    tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                elif category == 'pd_book':
                    tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                    tag.category = 'pd_book'  # make it look more like a tag.
                else:
                    log.warning("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                    # Nothing was loaded for this hit; falling through would
                    # reuse the previous tag (or hit an unbound name).
                    continue
            else:
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if same tag already exists
            if not (is_pdcounter and any(tag.slug == t.slug for t in tags)):
                tags.append(tag)
        except (catalogue.models.Tag.DoesNotExist,
                PDCounterAuthor.DoesNotExist,
                PDCounterBook.DoesNotExist):
            # Stale index entry: the object is gone from the database.
            pass

    log.debug('search_tags: %s' % tags)

    return tags
+
def search_books(self, query, filt=None, max_results=10):
    """
    Run `query` (optionally filtered by `filt`) and load the matching
    Book objects from the database.

    Hits whose book_id has no Book row are skipped.
    """
    hits = self.searcher.search(query, filt, max_results)
    loaded = []
    for hit in hits.scoreDocs:
        stored = self.searcher.doc(hit.doc)
        try:
            book = catalogue.models.Book.objects.get(id=stored.get("book_id"))
        except catalogue.models.Book.DoesNotExist:
            continue
        loaded.append(book)
    return loaded
+
def make_prefix_phrase(self, toks, field):
    """Build a MultiPhraseQuery in which the last token is a prefix.

    Every token but the last is added as an exact Term of `field`. The
    last token is expanded with a PrefixTermEnum into all terms the enum
    yields; if the expansion is empty the plain term is used instead.
    """
    query = MultiPhraseQuery()
    last_index = len(toks) - 1
    for index, tok in enumerate(toks):
        term = Term(field, tok)
        expanded = None
        if index == last_index:
            expanded = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), term))
        if expanded:
            query.add(expanded)
        else:
            query.add(term)
    return query
+
@staticmethod
def term_filter(term, inverse=False):
    """Return a filter accepting documents that contain `term`.

    With inverse=True the TermsFilter is wrapped in a BooleanFilter as a
    MUST_NOT clause, i.e. the returned filter excludes such documents.
    """
    matching = TermsFilter()
    matching.addTerm(term)

    if not inverse:
        return matching

    negated = BooleanFilter()
    negated.add(FilterClause(matching, BooleanClause.Occur.MUST_NOT))
    return negated
+
def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
    """
    Return auto-complete hints for tags.

    `string` is tokenized with the 'SIMPLE' field analysis and looked up in
    'tag_name' and 'tag_name_pl', either as a prefix phrase (prefix=True)
    or as a term query (optionally fuzzy). Tags of category "book" are
    filtered out; `max_results` and `pdcounter` go to search_tags.
    """
    tokens = self.get_tokens(string, field='SIMPLE')
    combined = BooleanQuery()

    for name_field in ['tag_name', 'tag_name_pl']:
        if prefix:
            field_query = self.make_prefix_phrase(tokens, name_field)
        else:
            field_query = self.make_term_query(tokens, name_field, fuzzy=fuzzy)
        combined.add(BooleanClause(field_query, BooleanClause.Occur.SHOULD))

    exclude_book_tags = self.term_filter(Term("tag_category", "book"), inverse=True)

    return self.search_tags(combined, exclude_book_tags,
                            max_results=max_results, pdcounter=pdcounter)
+
def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
    """
    Return auto-complete hints for book titles (needed because 'pseudo'
    title-tags are not indexed).

    `string` is tokenized with the 'SIMPLE' field analysis and matched
    against 'title', as a prefix phrase when prefix=True and as a term
    query (optionally fuzzy) otherwise. Only documents marked is_book=true
    are considered.
    """
    tokens = self.get_tokens(string, field='SIMPLE')

    if prefix:
        title_query = self.make_prefix_phrase(tokens, 'title')
    else:
        title_query = self.make_term_query(tokens, 'title', fuzzy=fuzzy)

    books_only = self.term_filter(Term("is_book", "true"))
    return self.search_books(title_query, books_only, max_results=max_results)
+
@staticmethod
def chain_filters(filters, op=ChainedFilter.AND):
    """
    Chain a list of filters together.

    filters - iterable of lucene filters; None entries are ignored
    op      - ChainedFilter logic used to combine them (AND by default)

    Returns None when no filter remains after dropping the None entries,
    otherwise a ChainedFilter over the remaining ones.
    """
    # Build a real list so the emptiness test below does not depend on
    # what filter() returns. The old extra `filters is []` check compared
    # identity with a fresh literal and could never be true.
    present = [f for f in filters if f is not None]
    if not present:
        return None
    return ChainedFilter(JArray('object')(present, Filter), op)
+
def filtered_categories(self, tags):
    """
    Return the distinct `category` values found among `tags`
    (the keys of a dict, so each category appears once).
    """
    return dict.fromkeys((tag.category for tag in tags), True).keys()
+
def hint(self):
    """Create and return a new Hint constructed with this object."""
    new_hint = Hint(self)
    return new_hint