KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
BlockJoinQuery, BlockJoinCollector, TermsFilter, \
HashSet, BooleanClause, Term, CharTermAttribute, \
- PhraseQuery, StringReader
+ PhraseQuery, StringReader, TermQuery, BlockJoinQuery, \
+ Sort
# KeywordAnalyzer
import sys
import os
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
+from threading import current_thread
import atexit
if overwrite:
self.remove_book(book)
+
doc = self.extract_metadata(book)
parts = self.extract_content(book)
block = ArrayList().of_(Document)
def extract_metadata(self, book):
book_info = dcparser.parse(book.xml_file)
+ print("extract metadata for book %s id=%d, thread%d" % (book.slug, book.id, current_thread().ident))
+
doc = self.create_book_doc(book)
doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
if ReusableIndex.index is not None:
self.index = ReusableIndex.index
else:
+ print("opening index")
ReusableIndex.pool = ThreadPool(threads)
ReusableIndex.pool_jobs = []
Index.open(self, analyzer)
atexit.register(ReusableIndex.close_reusable)
def index_book(self, *args, **kw):
+ """
+ Submit Index.index_book to the shared ReusableIndex.pool via
+ apply_async instead of running it inline.
+
+ Index.index_book is looked up on the class, so `self` is prepended
+ to the positional arguments explicitly. The handle returned by
+ apply_async is appended to ReusableIndex.pool_jobs.
+ """
- job = ReusableIndex.pool.apply_async(Index.index_book, args, kw)
+ job = ReusableIndex.pool.apply_async(Index.index_book, (self,)+ args, kw)
ReusableIndex.pool_jobs.append(job)
@staticmethod
def close_reusable():
if ReusableIndex.index is not None:
- all_jobs = len(ReusableIndex.pool_jobs)
- waited=1
+ print("closing index")
for job in ReusableIndex.pool_jobs:
- sys.stdout.write("\rWaiting for search index job: %d/%d..." %
job.wait()
- waited+=1
- print("Indexing done.")
ReusableIndex.pool.close()
ReusableIndex.index.optimize()
toks = []
while tokens.incrementToken():
cta = tokens.getAttribute(CharTermAttribute.class_)
- toks.append(cta)
+ toks.append(cta.toString())
return toks
- def make_phrase(self, tokens, field='content', joined=False):
+ def make_phrase(self, tokens, field='content', joined=False, slop=2):
phrase = PhraseQuery()
+ phrase.setSlop(slop)
for t in tokens:
term = Term(field, t)
phrase.add(term)
q = BooleanQuery()
for t in tokens:
term = Term(field, t)
- q.add(BooleanClause(term, modal))
+ q.add(BooleanClause(TermQuery(term), modal))
if joined:
- self.content_query(q)
+ q = self.content_query(q)
return q
def content_query(self, query):
+ """
+ Wrap `query` in a BlockJoinQuery built with self.parent_filter and
+ BlockJoinQuery.ScoreMode.Total, and return the wrapped query.
+ """
return BlockJoinQuery(query, self.parent_filter,
BlockJoinQuery.ScoreMode.Total)
- def multiseach(self, query, max_results=50):
+ def multisearch(self, query, max_results=50):
"""
Search strategy:
- (phrase) OR -> content
Should = BooleanClause.Occur.SHOULD
phrase_level = BooleanQuery()
+ phrase_level.setBoost(1.3)
p_content = self.make_phrase(tokens, joined=True)
p_title = self.make_phrase(tokens, 'title')
p_author = self.make_phrase(tokens, 'author')
phrase_level.add(BooleanClause(p_content, Should))
kw_level = BooleanQuery()
kw_level.add(self.make_term_query(tokens, 'author'), Should)
- kw_level.add(self.make_term_query(tokens, 'themes', joined=True), Should)
+ # The joined sub-queries are kept in named variables so they can be
+ # passed to collector.getTopGroups() later (see the notes below).
+ j_themes = self.make_term_query(tokens, 'themes', joined=True)
+ kw_level.add(j_themes, Should)
kw_level.add(self.make_term_query(tokens, 'tags'), Should)
- kw_level.add(self.make_term_query(tokens, joined=True), Should)
+ j_con = self.make_term_query(tokens, joined=True)
+ kw_level.add(j_con, Should)
top_level.add(BooleanClause(phrase_level, Should))
top_level.add(BooleanClause(kw_level, Should))
- tops = self.searcher.search(top_level, max_results)
+ # NOTE(review): the parent-hit limit is hard-coded to 100 and ignores
+ # the max_results argument -- confirm whether that is intended.
+ collector = BlockJoinCollector(Sort.RELEVANCE, 100, True, True)
+
+ # NOTE(review): only kw_level is searched here; top_level (which also
+ # contains phrase_level) is built above but not used -- confirm.
+ self.searcher.search(kw_level, collector)
+
+ # Phrases in content (not wired up yet):
+ # ph1 = collector.getTopGroups(j_themes, Sort.RELEVANCE,
+ # 0, 10, 0, True)
+ # Manual test from an interactive shell:
+ # reload(search.index); reload(search); s = search.MultiSearch(); s.multisearch(u'dusiołek')
+ # ph2 = collector.getTopGroups(j_con, Sort.RELEVANCE,
+ # 0, 10, 0, True)
+
+ # TODO: extract results from the collector; until then nothing is
+ # returned to the caller.
+ return None
+
+
+ def do_search(self, query, max_results=50, collector=None):
+ """
+ Run `query` on self.searcher, asking for at most `max_results` hits,
+ and load the Book model for each hit by its "book_id" field.
+
+ Returns a tuple (books, total_hits): the list of Book objects in hit
+ order and the totalHits value reported by the searcher.
+
+ NOTE(review): `collector` is accepted but not used by this method.
+ """
+ tops = self.searcher.search(query, max_results)
+
bks = []
for found in tops.scoreDocs:
doc = self.searcher.doc(found.doc)
- bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
+ b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
+ bks.append(b)
+ # Debug trace of each hit and its score; written in the call form
+ # used by the other print() calls in this module.
+ print("%s (%d) -> %f" % (b, b.id, found.score))
return (bks, tops.totalHits)