KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
BlockJoinQuery, BlockJoinCollector, TermsFilter, \
HashSet, BooleanClause, Term, CharTermAttribute, \
- PhraseQuery, StringReader
+ PhraseQuery, StringReader, TermQuery
# KeywordAnalyzer
+import sys
import os
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
+from multiprocessing.pool import ThreadPool
import atexit
def index_book(self, book, overwrite=True):
if overwrite:
self.remove_book(book)
+
doc = self.extract_metadata(book)
parts = self.extract_content(book)
if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
"""
index = None
- def open(self, analyzer=None):
+ pool = None
+ pool_jobs = None
+
+ def open(self, analyzer=None, threads=4):
if ReusableIndex.index is not None:
self.index = ReusableIndex.index
else:
- Index.open(self,analyzer)
+ ReusableIndex.pool = ThreadPool(threads)
+ ReusableIndex.pool_jobs = []
+ Index.open(self, analyzer)
ReusableIndex.index = self.index
atexit.register(ReusableIndex.close_reusable)
+ def index_book(self, *args, **kw):
+        job = ReusableIndex.pool.apply_async(Index.index_book, (self,) + args, kw)
+ ReusableIndex.pool_jobs.append(job)
+
@staticmethod
def close_reusable():
if ReusableIndex.index is not None:
+            for job in ReusableIndex.pool_jobs:
+                job.wait()
+            ReusableIndex.pool.close()
+            ReusableIndex.pool.join()
+
ReusableIndex.index.optimize()
ReusableIndex.index.close()
ReusableIndex.index = None
-
+
def close(self):
pass
+
class Search(IndexStore):
def __init__(self, default_field="content"):
IndexStore.__init__(self)
toks = []
while tokens.incrementToken():
cta = tokens.getAttribute(CharTermAttribute.class_)
- toks.append(cta)
+ toks.append(cta.toString())
return toks
- def make_phrase(self, tokens, field='content', joined=False):
+ def make_phrase(self, tokens, field='content', joined=False, slop=2):
phrase = PhraseQuery()
+ phrase.setSlop(slop)
for t in tokens:
term = Term(field, t)
phrase.add(term)
q = BooleanQuery()
for t in tokens:
term = Term(field, t)
- q.add(BooleanClause(term, modal))
+ q.add(BooleanClause(TermQuery(term), modal))
if joined:
self.content_query(q)
return q
return BlockJoinQuery(query, self.parent_filter,
BlockJoinQuery.ScoreMode.Total)
- def multiseach(self, query, max_results=50):
+ def multisearch(self, query, max_results=50):
"""
Search strategy:
- (phrase) OR -> content
Should = BooleanClause.Occur.SHOULD
phrase_level = BooleanQuery()
+ phrase_level.setBoost(1.3)
p_content = self.make_phrase(tokens, joined=True)
p_title = self.make_phrase(tokens, 'title')
top_level.add(BooleanClause(phrase_level, Should))
top_level.add(BooleanClause(kw_level, Should))
- tops = self.searcher.search(top_level, max_results)
+ print self.do_search(phrase_level)
+ print self.do_search(kw_level)
+ print self.do_search(top_level)
+
+ def do_search(self, query, max_results=50):
+ tops = self.searcher.search(query, max_results)
+ #tops = self.searcher.search(p_content, max_results)
+
bks = []
for found in tops.scoreDocs:
doc = self.searcher.doc(found.doc)
- bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
+ b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
+ bks.append(b)
+ print "%s (%d) -> %f" % (b, b.id, found.score)
return (bks, tops.totalHits)