X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/18e520abab50a7e37eff7b09b9754ed32899811b..5c3f2f665243a32510f8097bf6c0582fc14d2fd3:/apps/search/index.py diff --git a/apps/search/index.py b/apps/search/index.py index 33f2aa1aa..42a271e02 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -7,13 +7,15 @@ from lucene import SimpleFSDirectory, IndexWriter, File, Field, \ KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \ BlockJoinQuery, BlockJoinCollector, TermsFilter, \ HashSet, BooleanClause, Term, CharTermAttribute, \ - PhraseQuery, StringReader + PhraseQuery, StringReader, TermQuery # KeywordAnalyzer +import sys import os import errno from librarian import dcparser from librarian.parser import WLDocument import catalogue.models +from multiprocessing.pool import ThreadPool import atexit @@ -80,6 +82,7 @@ class Index(IndexStore): def index_book(self, book, overwrite=True): if overwrite: self.remove_book(book) + doc = self.extract_metadata(book) parts = self.extract_content(book) @@ -247,24 +250,38 @@ class ReusableIndex(Index): if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself. """ index = None - def open(self, analyzer=None): + pool = None + pool_jobs = None + + def open(self, analyzer=None, threads=4): if ReusableIndex.index is not None: self.index = ReusableIndex.index else: - Index.open(self,analyzer) + ReusableIndex.pool = ThreadPool(threads) + ReusableIndex.pool_jobs = [] + Index.open(self, analyzer) ReusableIndex.index = self.index atexit.register(ReusableIndex.close_reusable) + def index_book(self, *args, **kw): + job = ReusableIndex.pool.apply_async(Index.index_book, (self,)+ args, kw) + ReusableIndex.pool_jobs.append(job) + @staticmethod def close_reusable(): if ReusableIndex.index is not None: + for job in ReusableIndex.pool_jobs: + job.wait() + ReusableIndex.pool.close() + ReusableIndex.index.optimize() ReusableIndex.index.close() ReusableIndex.index = None - + def close(self): pass + class Search(IndexStore): def __init__(self, default_field="content"): IndexStore.__init__(self) @@ -357,11 +374,12 @@ class MultiSearch(Search): toks = [] while tokens.incrementToken(): cta = tokens.getAttribute(CharTermAttribute.class_) - toks.append(cta) + toks.append(cta.toString()) return toks - def make_phrase(self, tokens, field='content', joined=False): + def make_phrase(self, tokens, field='content', joined=False, slop=2): phrase = PhraseQuery() + phrase.setSlop(slop) for t in tokens: term = Term(field, t) phrase.add(term) @@ -373,7 +391,7 @@ class MultiSearch(Search): q = BooleanQuery() for t in tokens: term = Term(field, t) - q.add(BooleanClause(term, modal)) + q.add(BooleanClause(TermQuery(term), modal)) if joined: self.content_query(q) return q @@ -382,7 +400,7 @@ class MultiSearch(Search): return BlockJoinQuery(query, self.parent_filter, BlockJoinQuery.ScoreMode.Total) - def multiseach(self, query, max_results=50): + def multisearch(self, query, max_results=50): """ Search strategy: - (phrase) OR -> content @@ -400,6 +418,7 @@ class MultiSearch(Search): Should = BooleanClause.Occur.SHOULD phrase_level = BooleanQuery() + phrase_level.setBoost(1.3) p_content = self.make_phrase(tokens, joined=True) p_title = self.make_phrase(tokens, 'title') @@ -419,9 +438,18 @@ class MultiSearch(Search): top_level.add(BooleanClause(phrase_level, Should)) top_level.add(BooleanClause(kw_level, Should)) - tops = self.searcher.search(top_level, max_results) + print self.do_search(phrase_level) + print self.do_search(kw_level) + print self.do_search(top_level) + + def do_search(self, query, max_results=50): + tops = self.searcher.search(query, max_results) + #tops = self.searcher.search(p_content, max_results) + bks = [] for found in tops.scoreDocs: doc = self.searcher.doc(found.doc) - bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) + b = catalogue.models.Book.objects.get(id=doc.get("book_id")) + bks.append(b) + print "%s (%d) -> %f" % (b, b.id, found.score) return (bks, tops.totalHits)