- if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
- """
# Process-wide state shared by every ReusableIndex instance; all three stay
# None until the first open() populates them.
#   index     -- the shared, already opened index object
#   pool      -- worker pool that runs queued indexing jobs
#   pool_jobs -- async results of the jobs queued so far
index = pool = pool_jobs = None
-
def open(self, analyzer=None, threads=4):
    """Attach this instance to the shared index, creating it on first use.

    If a shared index already exists it is simply reused. Otherwise a
    worker pool of `threads` threads is started, the index is opened
    through Index.open(self, analyzer), the result is published on the
    class, and close_reusable() is registered to run at interpreter exit.
    """
    shared = ReusableIndex.index
    if shared is not None:
        self.index = shared
        return

    def attach_worker():
        # Runs once in every pool thread before it picks up work.
        # NOTE(review): presumably needed so worker threads may call
        # into the JVM -- confirm against the JCC/PyLucene docs.
        JVM.attachCurrentThread()

    print("opening index")
    ReusableIndex.pool = ThreadPool(threads, initializer=attach_worker)
    ReusableIndex.pool_jobs = []
    Index.open(self, analyzer)
    ReusableIndex.index = self.index
    atexit.register(ReusableIndex.close_reusable)
-
def index_book(self, *args, **kw):
    """Queue Index.index_book(self, *args, **kw) on the shared worker pool.

    Returns immediately with None. The pending result is recorded in
    ReusableIndex.pool_jobs so that close_reusable() can wait for it.
    """
    submit = ReusableIndex.pool.apply_async
    task = log_exception_wrapper(Index.index_book)
    call_args = (self,) + args
    pending = submit(task, call_args, kw)
    ReusableIndex.pool_jobs.append(pending)
-
@staticmethod
def close_reusable():
    """Wait for queued indexing jobs, then optimize and close the shared index.

    Does nothing when no shared index is open. Registered with atexit by
    open(), but may also be called directly.

    An exception raised by a job (re-raised from job.get()) still
    propagates to the caller, but the worker pool is shut down and the
    index is closed first, and the class-level state is reset so that a
    later open() starts from scratch. optimize() is skipped in that case.
    """
    if ReusableIndex.index is None:
        return

    try:
        try:
            print("wait for indexing to finish")
            for job in ReusableIndex.pool_jobs:
                job.get()
                sys.stdout.write('.')
                sys.stdout.flush()
            print("done.")
        finally:
            # close() only stops new submissions; join() waits until the
            # workers have really finished, so nothing is still writing
            # to the index when it is closed below.
            ReusableIndex.pool.close()
            ReusableIndex.pool.join()

        ReusableIndex.index.optimize()
    finally:
        try:
            ReusableIndex.index.close()
        finally:
            ReusableIndex.index = None
            ReusableIndex.pool = None
            ReusableIndex.pool_jobs = None
-
def close(self):
    """Intentionally do nothing.

    The index is shared between instances, so closing one instance must
    not close it; ReusableIndex.close_reusable() does the real shutdown.
    """
    return None
-
-
class Search(IndexStore):
    """Query front-end over the index: parses query strings and returns Books."""

    def __init__(self, default_field="content"):
        """Set up the searcher, the query parser and the parent-document filter.

        default_field is the field the query parser uses for terms that
        do not name a field explicitly.
        """
        IndexStore.__init__(self)
        # Alternative analyzer: PolishAnalyzer(Version.LUCENE_34)
        self.analyzer = WLAnalyzer()
        # Second argument True opens the searcher read-only (Lucene 3.x).
        # NOTE(review): self.store is presumably set by IndexStore.__init__.
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        # Matches the documents flagged is_book=true; used as the parent
        # filter of every BlockJoinQuery built by this class.
        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query string with the configured parser and return the query object."""
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=()):
        """Wrap the parts of `query` that only touch `fields` in a BlockJoinQuery.

        The query is modified recursively: every clause of a BooleanQuery
        is processed in place, and any other query whose terms all belong
        to `fields` is wrapped in a BlockJoinQuery, and so delegated to
        the child documents. A query that contains a term from any other
        field is returned unchanged.
        """
        if BooleanQuery.instance_(query):
            boolean_query = BooleanQuery.cast_(query)
            for clause in boolean_query:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return boolean_query

        termset = HashSet()
        query.extractTerms(termset)
        if any(Term.cast_(term).field() not in fields for term in termset):
            return query
        return BlockJoinQuery(query, self.parent_filter,
                              BlockJoinQuery.ScoreMode.Total)

    def _collect_books(self, query, max_results):
        """Run `query` and load the Book model behind every hit.

        Returns (books, total_hits): the Book objects for at most
        max_results hits, in hit order, and the total number of matches
        reported by the searcher. Each book is fetched with its own
        models.Book.objects.get() call, keyed by the stored "book_id".
        """
        tops = self.searcher.search(query, max_results)
        books = []
        for hit in tops.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            books.append(models.Book.objects.get(id=doc.get("book_id")))
        return (books, tops.totalHits)

    def simple_search(self, query, max_results=50):
        """Run the parsed query as is. Returns (books, total_hits)."""
        return self._collect_books(self.query(query), max_results)

    def search(self, query, max_results=50):
        """Search with "content" and "themes" terms block-joined via wrapjoins().

        Returns (books, total_hits).
        """
        joined = self.wrapjoins(self.query(query), ["content", "themes"])
        return self._collect_books(joined, max_results)

    def bsearch(self, query, max_results=50):
        """Wrap the whole parsed query in one BlockJoinQuery scored with Avg.

        Returns (books, total_hits).
        """
        block_join = BlockJoinQuery(self.query(query), self.parent_filter,
                                    BlockJoinQuery.ScoreMode.Avg)
        return self._collect_books(block_join, max_results)
-
-# TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
-# OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
-# CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
-
-# while (tokenStream.incrementToken()) {
-# int startOffset = offsetAttribute.startOffset();
-# int endOffset = offsetAttribute.endOffset();
-# String term = charTermAttribute.toString();
-# }