+
def get_tokens(self, searched, field='content'):
    """Run `searched` through the analyzer for `field` and return term strings.

    `searched` may be:
      * a list - assumed to be already-produced tokens and returned unchanged,
        so callers can reuse an earlier result;
      * a str/unicode string - wrapped in a StringReader first;
      * anything else - used as a reader as-is.

    The reader is reset before being handed to the analyzer.
    """
    if isinstance(searched, list):
        # Already tokenized: give the very same list back.
        return searched
    if isinstance(searched, (str, unicode)):
        searched = StringReader(searched)

    searched.reset()
    stream = self.analyzer.reusableTokenStream(field, searched)
    terms = []
    while stream.incrementToken():
        terms.append(stream.getAttribute(CharTermAttribute.class_).toString())
    return terms
+
def fuzziness(self, fuzzy):
    """Sanitize a fuzziness setting into a minimum-similarity value.

    Returns None when `fuzzy` is falsy (fuzzy matching is off), `fuzzy`
    itself when it is a float strictly between 0.0 and 1.0, and the
    default 0.5 for everything else (True, ints, out-of-range floats).

    The upper bound is exclusive: Lucene's FuzzyQuery and FuzzyTermEnum
    reject a minimum similarity >= 1.0 with an IllegalArgumentException,
    so 1.0 must not be passed through to them.
    """
    if not fuzzy:
        return None
    if isinstance(fuzzy, float) and 0.0 < fuzzy < 1.0:
        return fuzzy
    return 0.5
+
def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
    """
    Build a phrase query for `tokens` in `field`.

    Without `fuzzy`: a PhraseQuery with the given `slop`, one Term per token.
    With `fuzzy`: a MultiPhraseQuery in which every position holds all terms
    enumerated by a FuzzyTermEnum for that token (similarity taken from
    self.fuzziness(fuzzy)), or the plain term when the enumeration yields
    nothing. `slop` is not applied in the fuzzy case.
    """
    if not fuzzy:
        phrase = PhraseQuery()
        phrase.setSlop(slop)
        for token in tokens:
            phrase.add(Term(field, token))
        return phrase

    phrase = MultiPhraseQuery()
    for token in tokens:
        exact = Term(field, token)
        similar = FuzzyTermEnum(self.searcher.getIndexReader(), exact,
                                self.fuzziness(fuzzy))
        # enum_to_array drains the enum and returns None when it had no terms.
        candidates = self.enum_to_array(similar)
        if candidates is None:
            phrase.add(exact)
        else:
            phrase.add(candidates)
    return phrase
+
def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
    """
    Return a BooleanQuery with one clause per token.

    modal - occurrence (SHOULD/MUST/...) used for every clause
    fuzzy - when truthy each clause is a FuzzyQuery (similarity from
            self.fuzziness(fuzzy)); otherwise a plain TermQuery.
    """
    combined = BooleanQuery()
    for token in tokens:
        term = Term(field, token)
        if fuzzy:
            clause_query = FuzzyQuery(term, self.fuzziness(fuzzy))
        else:
            clause_query = TermQuery(term)
        combined.add(BooleanClause(clause_query, modal))
    return combined
+
+ # def content_query(self, query):
+ # return BlockJoinQuery(query, self.parent_filter,
+ # BlockJoinQuery.ScoreMode.Total)
+
def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
    """
    Look for books whose authors or title match `searched` as a phrase.

    When a hint is given it may veto the book search entirely (an empty list
    is returned), narrow the list of fields, and supply an extra filter.
    One phrase query is run per field; results of all queries are
    concatenated, each limited to `max_results` hits and restricted to
    documents with is_book == 'true'.
    """
    fields_to_search = ['authors', 'title']
    only_in = None
    if hint:
        if not hint.should_search_for_book():
            return []
        fields_to_search = hint.just_search_in(fields_to_search)
        only_in = hint.book_filter()

    queries = [
        self.make_phrase(self.get_tokens(searched, field=name), field=name, fuzzy=fuzzy)
        for name in fields_to_search
    ]

    results = []
    for query in queries:
        books_only = self.chain_filters(
            [only_in, self.term_filter(Term('is_book', 'true'))])
        hits = self.searcher.search(query, books_only, max_results)
        results.extend([SearchResult(self.searcher, hit) for hit in hits.scoreDocs])
    return results
+
def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
    """
    Search book documents by individual terms in tags, authors and title.

    `searched` is tokenized for the 'SIMPLE' field; a term query per field is
    OR-ed (SHOULD) into one BooleanQuery. A hint may veto the search (an
    empty list is returned), narrow the fields and add a filter. Only
    documents with is_book == 'true' are considered.
    """
    fields_to_search = ['tags', 'authors', 'title']
    only_in = None
    if hint:
        if not hint.should_search_for_book():
            return []
        fields_to_search = hint.just_search_in(fields_to_search)
        only_in = hint.book_filter()

    tokens = self.get_tokens(searched, field='SIMPLE')

    any_field = BooleanQuery()
    for name in fields_to_search:
        per_field = self.make_term_query(tokens, field=name, fuzzy=fuzzy)
        any_field.add(BooleanClause(per_field, BooleanClause.Occur.SHOULD))

    books_only = self.chain_filters(
        [only_in, self.term_filter(Term('is_book', 'true'))])
    hits = self.searcher.search(any_field, books_only, max_results)
    return [SearchResult(self.searcher, hit) for hit in hits.scoreDocs]
+
def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
    """
    Search for book parts which contain a phrase matching `searched`
    (with make_phrase()'s default slop) in their content.

    Documents with is_book == 'true' are excluded; a hint, if given,
    contributes its part filter. Every result carries highlighted snippets
    obtained from get_snippets().
    """
    queries = [self.make_phrase(self.get_tokens(searched), field=name, fuzzy=fuzzy)
               for name in ['content']]

    part_filter = hint.part_filter() if hint else None

    results = []
    for query in queries:
        not_a_book = self.term_filter(Term('is_book', 'true'), inverse=True)
        hits = self.searcher.search(
            query, self.chain_filters([not_a_book, part_filter]), max_results)
        for hit in hits.scoreDocs:
            results.append(SearchResult(self.searcher, hit,
                                        snippets=self.get_snippets(hit, query)))
    return results
+
def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
    """
    Try to match the search terms against different fields of a book
    (or its parts) at once.

    Two queries are run and their results concatenated:
      1. themes x content - themes_pl (MUST, skipped when the hint already
         restricts themes) combined with content (SHOULD);
      2. scatter search - themes_pl/content (MUST) combined with
         tags/authors/title (SHOULD); this one is never fuzzy.
    A hint, if given, supplies the filter used for both searches.
    """
    must = BooleanClause.Occur.MUST
    should = BooleanClause.Occur.SHOULD

    results = []
    only_in = hint.part_filter() if hint else None

    content_tokens = self.get_tokens(searched, field='content')
    simple_tokens = self.get_tokens(searched, field='SIMPLE')

    # Pass 1: themes x content.
    theme_content = BooleanQuery()
    # only search in themes when we do not already filter by themes
    if hint is None or hint.just_search_in(['themes']) != []:
        theme_content.add(BooleanClause(
            self.make_term_query(content_tokens, field='themes_pl', fuzzy=fuzzy), must))
    theme_content.add(BooleanClause(
        self.make_term_query(content_tokens, field='content', fuzzy=fuzzy), should))

    for hit in self.searcher.search(theme_content, only_in, max_results).scoreDocs:
        results.append(SearchResult(self.searcher, hit))
        print("* %s theme x content: %s" % (searched, results[-1]._hits))

    # Pass 2: themes/content x author/title/tags.
    in_content = BooleanQuery()
    for name in ['themes_pl', 'content']:
        in_content.add(BooleanClause(
            self.make_term_query(content_tokens, field=name, fuzzy=False), should))

    in_meta = BooleanQuery()
    for name in ['tags', 'authors', 'title']:
        in_meta.add(BooleanClause(
            self.make_term_query(simple_tokens, field=name, fuzzy=False), should))

    scattered = BooleanQuery()
    scattered.add(BooleanClause(in_content, must))
    scattered.add(BooleanClause(in_meta, should))

    for hit in self.searcher.search(scattered, only_in, max_results).scoreDocs:
        results.append(SearchResult(self.searcher, hit))
        print("* %s scatter search: %s" % (searched, results[-1]._hits))

    return results
+
+ # def multisearch(self, query, max_results=50):
+ # """
+ # Search strategy:
+ # - (phrase) OR -> content
+ # -> title
+ # -> authors
+ # - (keywords) -> authors
+ # -> motyw
+ # -> tags
+ # -> content
+ # """
+ # queryreader = StringReader(query)
+ # tokens = self.get_tokens(queryreader)
+
+ # top_level = BooleanQuery()
+ # Should = BooleanClause.Occur.SHOULD
+
+ # phrase_level = BooleanQuery()
+ # phrase_level.setBoost(1.3)
+
+ # p_content = self.make_phrase(tokens, joined=True)
+ # p_title = self.make_phrase(tokens, 'title')
+ # p_author = self.make_phrase(tokens, 'author')
+
+ # phrase_level.add(BooleanClause(p_content, Should))
+ # phrase_level.add(BooleanClause(p_title, Should))
+ # phrase_level.add(BooleanClause(p_author, Should))
+
+ # kw_level = BooleanQuery()
+
+ # kw_level.add(self.make_term_query(tokens, 'author'), Should)
+ # j_themes = self.make_term_query(tokens, 'themes', joined=True)
+ # kw_level.add(j_themes, Should)
+ # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
+ # j_con = self.make_term_query(tokens, joined=True)
+ # kw_level.add(j_con, Should)
+
+ # top_level.add(BooleanClause(phrase_level, Should))
+ # top_level.add(BooleanClause(kw_level, Should))
+
+ # return None
+
def get_snippets(self, scoreDoc, query, field='content'):
    """
    Return highlighted fragments for the document found as `scoreDoc`.

    The text is read from the Snippets store of the document's book, at the
    position/length kept in the stored fields, and then highlighted against
    `query`: up to 3 best fragments joined with "...".
    """
    stored = self.searcher.doc(scoreDoc.doc)

    # Fetch the raw text; the snippets file is closed even if reading fails.
    snippet_file = Snippets(stored.get('book_id')).open()
    try:
        position = int(stored.get('snippets_position'))
        length = int(stored.get('snippets_length'))
        text = snippet_file.get((position, length))
    finally:
        snippet_file.close()

    highlighter = Highlighter(SimpleHTMLFormatter(), QueryScorer(query))
    token_stream = TokenSources.getAnyTokenStream(
        self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
    return highlighter.getBestFragments(token_stream, text, 3, "...")
+
@staticmethod
def enum_to_array(enum):
    """
    Drain a lucene TermEnum into a Java array of Terms, suitable for
    adding to queries. Returns None when the enum produced no terms.
    """
    collected = []
    has_more = True
    while has_more:
        # The enum is already positioned on its first term, so read
        # the current term before advancing.
        current = enum.term()
        if current:
            collected.append(current)
        has_more = enum.next()

    if not collected:
        return None
    return JArray('object')(collected, Term)
+
def search_tags(self, query, filter=None, max_results=40):
    """
    Run `query` and return the catalogue Tag objects for the hits.

    Each hit's stored "tag_id" field is used to load the Tag; the tag, its
    id and the hit score are printed as a debugging aid.
    """
    hits = self.searcher.search(query, filter, max_results)

    found_tags = []
    for hit in hits.scoreDocs:
        tag_id = self.searcher.doc(hit.doc).get("tag_id")
        tag = catalogue.models.Tag.objects.get(id=tag_id)
        found_tags.append(tag)
        print("%s (%d) -> %f" % (tag, tag.id, hit.score))

    return found_tags
+
def search_books(self, query, filter=None, max_results=10):
    """
    Run `query` and return the catalogue Book objects for the hits,
    loaded by each hit's stored "book_id" field.
    """
    hits = self.searcher.search(query, filter, max_results)
    return [
        catalogue.models.Book.objects.get(id=self.searcher.doc(hit.doc).get("book_id"))
        for hit in hits.scoreDocs
    ]
+
def create_prefix_phrase(self, toks, field):
    """
    Build a MultiPhraseQuery in which every token but the last must match
    exactly, and the last one is expanded to all indexed terms it is a
    prefix of (falling back to the plain term when there are none).
    """
    query = MultiPhraseQuery()
    last = len(toks) - 1
    for position, tok in enumerate(toks):
        term = Term(field, tok)
        expanded = None
        if position == last:
            # Only the final token is treated as a prefix.
            expanded = Search.enum_to_array(
                PrefixTermEnum(self.searcher.getIndexReader(), term))
        if expanded:
            query.add(expanded)
        else:
            query.add(term)
    return query
+
@staticmethod
def term_filter(term, inverse=False):
    """
    Return a filter accepting documents that contain `term`; with
    `inverse` set, a BooleanFilter excluding them (MUST_NOT) instead.
    """
    matching = TermsFilter()
    matching.addTerm(term)
    if not inverse:
        return matching

    negated = BooleanFilter()
    negated.add(FilterClause(matching, BooleanClause.Occur.MUST_NOT))
    return negated
+
def hint_tags(self, string, max_results=50):
    """
    Return auto-complete hints for tags using a prefix search on the
    tag_name and tag_name_pl fields; tags of category "book" are
    filtered out.
    """
    toks = self.get_tokens(string, field='SIMPLE')

    either_name = BooleanQuery()
    for name in ['tag_name', 'tag_name_pl']:
        either_name.add(BooleanClause(self.create_prefix_phrase(toks, name),
                                      BooleanClause.Occur.SHOULD))

    excluding_books = self.term_filter(Term("tag_category", "book"), inverse=True)
    return self.search_tags(either_name, excluding_books, max_results=max_results)
+
def hint_books(self, string, max_results=50):
    """
    Return auto-complete hints for book titles (needed because 'pseudo'
    title-tags are not indexed). Prefix search on the title field,
    restricted to documents with is_book == "true".
    """
    title_prefix = self.create_prefix_phrase(
        self.get_tokens(string, field='SIMPLE'), 'title')
    books_only = self.term_filter(Term("is_book", "true"))
    return self.search_books(title_prefix, books_only, max_results=max_results)
+
@staticmethod
def chain_filters(filters, op=ChainedFilter.AND):
    """
    Combine the non-None entries of `filters` into one ChainedFilter using
    `op`. Returns None when no filter is left after dropping the Nones.
    """
    present = [flt for flt in filters if flt is not None]
    if not present:
        return None
    return ChainedFilter(JArray('object')(present, Filter), op)
+
def filtered_categories(self, tags):
    """
    Return a list of the distinct categories found among `tags`
    (each tag is expected to expose a `category` attribute).
    """
    seen = dict((tag.category, True) for tag in tags)
    return list(seen)
+
def hint(self):
    """Create a new Hint object bound to this search instance."""
    return Hint(self)