# -*- coding: utf-8 -*-
+
from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, File, Field, \
NumericField, Version, Document, JavaError, IndexSearcher, \
BlockJoinQuery, BlockJoinCollector, TermsFilter, \
HashSet, BooleanClause, Term, CharTermAttribute, \
PhraseQuery, StringReader, TermQuery, BlockJoinQuery, \
- Sort, Integer
+ Sort, Integer, \
+ initVM, CLASSPATH
# KeywordAnalyzer
+JVM = initVM(CLASSPATH)
import sys
import os
import errno
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
-
+import traceback
class WLAnalyzer(PerFieldAnalyzerWrapper):
def __init__(self):
if overwrite:
self.remove_book(book)
-
doc = self.extract_metadata(book)
parts = self.extract_content(book)
block = ArrayList().of_(Document)
self.close()
def log_exception_wrapper(f):
    """Wrap *f* so that any exception it raises is printed before propagating.

    The wrapper forwards positional and keyword arguments unchanged and
    returns whatever *f* returns.  When *f* raises, a short message and the
    full traceback are printed, then the same exception is re-raised so the
    caller still sees the failure.
    """
    import functools

    @functools.wraps(f)
    def _wrap(*a, **kw):
        try:
            return f(*a, **kw)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            # Bare raise keeps the original traceback (``raise e`` resets it
            # on Python 2).
            raise
    return _wrap
+
+
class ReusableIndex(Index):
"""
Works like index, but does not close/optimize Lucene index
self.index = ReusableIndex.index
else:
print("opening index")
- ReusableIndex.pool = ThreadPool(threads)
+ ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread() )
ReusableIndex.pool_jobs = []
Index.open(self, analyzer)
ReusableIndex.index = self.index
atexit.register(ReusableIndex.close_reusable)
def index_book(self, *args, **kw):
    """Queue ``Index.index_book(self, *args, **kw)`` on the shared thread pool.

    The call is wrapped with ``log_exception_wrapper`` and submitted
    asynchronously; the resulting job handle is appended to
    ``ReusableIndex.pool_jobs``.
    """
    guarded_call = log_exception_wrapper(Index.index_book)
    positional = (self,) + args
    pending = ReusableIndex.pool.apply_async(guarded_call, positional, kw)
    ReusableIndex.pool_jobs.append(pending)
@staticmethod
if ReusableIndex.index is not None:
print("closing index")
for job in ReusableIndex.pool_jobs:
- job.wait()
+ job.get()
+ sys.stdout.write('.')
+ sys.stdout.flush()
ReusableIndex.pool.close()
ReusableIndex.index.optimize()
# }
class SearchResult(object):
    """Search hits for a single book, built from one Lucene score document.

    Attributes:
        score: overall score of the result (see ``__init__``).
        book_id: integer id read from the stored ``book_id`` field.
        fragments: fragment anchors that matched.
        sections: ``(header_type, header_index)`` tuples that matched.
        scores: maps each fragment anchor / section tuple to its hit score.
    """

    def __init__(self, searcher, scoreDocs, score=None):
        """Load the stored document behind *scoreDocs* and record what matched.

        searcher  -- object whose ``doc(doc_id)`` returns the stored document
        scoreDocs -- a single hit exposing ``.doc`` and ``.score``
        score     -- overall score override; the hit's own score is used
                     when omitted
        """
        # Test against None so that an explicit score of 0 is honoured.
        if score is None:
            score = scoreDocs.score
        self.score = score

        self.fragments = []
        self.scores = {}
        self.sections = []

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        fragment = stored.get("fragment_anchor")
        if fragment:
            self.fragments.append(fragment)
            self.scores[fragment] = scoreDocs.score

        header_type = stored.get("header_type")
        if header_type:
            sec = (header_type, int(stored.get("header_index")))
            self.sections.append(sec)
            self.scores[sec] = scoreDocs.score

    def get_book(self):
        """Fetch the Book model instance with id ``book_id``."""
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def get_parts(self):
        """Return matched sections and fragments as dicts, ordered by score.

        Section entries carry ``header`` and ``position``; fragment entries
        carry ``fragment`` (looked up on the book by anchor).  Every entry
        also has a ``_score_key`` used to find its score in ``self.scores``.
        The list is sorted by ascending score.
        """
        book = self.book
        parts = [{"header": s[0], "position": s[1], '_score_key': s}
                 for s in self.sections]
        parts += [{"fragment": book.fragments.get(anchor=f), '_score_key': f}
                  for f in self.fragments]

        # key= is equivalent to the old cmp-based sort and is also valid on
        # interpreters without the cmp builtin.
        parts.sort(key=lambda part: self.scores[part['_score_key']])
        print("bookid: %d parts: %s" % (self.book_id, parts))
        return parts

    parts = property(get_parts)

    def merge(self, other):
        """Fold *other* (a result for the same book) into this one, in place.

        Fragments, sections and per-part scores are combined and the higher
        overall score is kept.  Returns ``self``.

        Raises ValueError when the two results belong to different books.
        """
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.fragments += other.fragments
        self.sections += other.sections
        self.scores.update(other.scores)
        if other.score > self.score:
            self.score = other.score
        return self

    def __unicode__(self):
        # The score is a float, so format it with %f rather than truncating.
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        """Merge results from several lists so each book appears once.

        The first result seen for a book is kept and later results for the
        same book are merged into it (mutating that first object).  Returns
        a list with one SearchResult per book.
        """
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return list(books.values())

    def __cmp__(self, other):
        # Python 2 ordering hook: results compare by overall score.
        return cmp(self.score, other.score)

    def __lt__(self, other):
        # Same ordering as __cmp__, usable where __cmp__ is not consulted.
        return self.score < other.score
+
+
class MultiSearch(Search):
"""Class capable of IMDb-like searching"""
def get_tokens(self, queryreader):
return BlockJoinQuery(query, self.parent_filter,
BlockJoinQuery.ScoreMode.Total)
- def search_perfect(self, tokens, max_results=20):
- qrys = [self.make_phrase(tokens, field=fld) for fld in ['author', 'title', 'content']]
def search_perfect_book(self, tokens, max_results=20):
    """Run a ``make_phrase`` query for *tokens* on the author and title fields.

    Each field is searched separately with at most *max_results* hits, and
    every hit is wrapped in a SearchResult.  Returns the combined list,
    author hits first.
    """
    # Build every phrase query up front, before any search is executed.
    phrase_queries = []
    for field_name in ['author', 'title']:
        phrase_queries.append(self.make_phrase(tokens, field=field_name))

    results = []
    for phrase in phrase_queries:
        hits = self.searcher.search(phrase, max_results).scoreDocs
        for hit in hits:
            results.append(SearchResult(self.searcher, hit))
    return results
+
def search_perfect_parts(self, tokens, max_results=20):
    """Run a ``make_phrase`` query for *tokens* on the content field.

    At most *max_results* hits are requested, and every hit is wrapped in a
    SearchResult.  Returns the list of results.
    """
    # Build the phrase queries first, then execute them in order.
    phrase_queries = []
    for field_name in ['content']:
        phrase_queries.append(self.make_phrase(tokens, field=field_name))

    results = []
    for phrase in phrase_queries:
        hits = self.searcher.search(phrase, max_results).scoreDocs
        for hit in hits:
            results.append(SearchResult(self.searcher, hit))
    return results
def search_everywhere(self, tokens, max_results=20):
top_groups = collector.getTopGroups(in_content_join, Sort.RELEVANCE, 0, max_results, 0, True)
if top_groups:
for grp in top_groups.groups:
- doc_id = Integer.cast_(grp.groupValue).intValue()
- book_data = self.searcher.doc(doc_id)
- book = catalogue.models.Book.objects.get(id=book_data.get("book_id"))
- parts = []
for part in grp.scoreDocs:
- part_data = self.searcher.doc(part.doc)
- header_type = part_data.get("header_type")
- if header_type:
- parts.append((part.score, {"header": header_type, "position": int(part_data.get("header_index"))}))
- fragment = part_data.get("fragment_anchor")
- if fragment:
- fragment = book.fragments.get(anchor=fragment)
- parts.append((part.score, {"fragment": fragment}))
- books.append((grp.maxScore, book, parts))
-
+ books.append(SearchResult(self.searcher, part, score=grp.maxScore))
return books
-
def multisearch(self, query, max_results=50):
"""
Search strategy: