# KeywordAnalyzer
# Initialize JVM
-JVM = initVM(CLASSPATH)
+JVM = initVM(CLASSPATH, maxheap=settings.JVM_MAXHEAP)
import sys
import os
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from multiprocessing.pool import ThreadPool
from threading import current_thread
+from itertools import chain
import atexit
import traceback
-
+import logging
+log = logging.getLogger('search')
class WLAnalyzer(PerFieldAnalyzerWrapper):
def __init__(self):
if not os.path.exists(self.path):
break
self.revision += 1
- print "using %s" % self.path
self.file = open(self.path, mode)
self.position = 0
try:
self.index.optimize()
except JavaError, je:
- print "Error during optimize phase, check index: %s" % je
+ log.error("Error during optimize phase, check index: %s" % je)
self.index.close()
self.index = None
if not remove_only:
# then add them [all or just one passed]
if not tags:
- tags = catalogue.models.Tag.objects.exclude(category='set') + \
- PDCounterAuthor.objects.all() + \
- PDCounterBook.objects.all()
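+ # QuerySets of different models cannot be concatenated with "+"; chain() iterates over all of them lazily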
+ tags = chain(catalogue.models.Tag.objects.exclude(category='set'),
+ PDCounterAuthor.objects.all(),
+ PDCounterBook.objects.all())
for tag in tags:
if isinstance(tag, PDCounterAuthor):
doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
return doc
- def remove_book(self, book, remove_snippets=True):
+ def remove_book(self, book_or_id, remove_snippets=True):
"""Removes a book from search index.
- book - Book instance."""
+ book_or_id - Book instance or a book id."""
- q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
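+ # accept either a Book instance or a bare book id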
+ if isinstance(book_or_id, catalogue.models.Book):
+ book_id = book_or_id.id
+ else:
+ book_id = book_or_id
+
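+ # an inclusive [book_id, book_id] range matches every document indexed for this book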
+ q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
self.index.deleteDocuments(q)
if remove_snippets:
- snippets = Snippets(book.id)
+ snippets = Snippets(book_id)
snippets.remove()
def index_book(self, book, book_info=None, overwrite=True):
self.remove_book(book, remove_snippets=False)
book_doc = self.create_book_doc(book)
- meta_fields = self.extract_metadata(book, book_info)
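+ # dc_only limits metadata extraction to the few fields needed here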
+ meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+ # let's not index it - it's only used for extracting publish date
+ if 'source_name' in meta_fields:
+ del meta_fields['source_name']
+
for f in meta_fields.values():
if isinstance(f, list) or isinstance(f, tuple):
for elem in f:
published_date_re = re.compile("([0-9]+)[\]. ]*$")
- def extract_metadata(self, book, book_info=None):
+ def extract_metadata(self, book, book_info=None, dc_only=None):
"""
- Extract metadata from book and returns a map of fields keyed by fieldname
+ Extracts metadata from the book and returns a map of fields keyed by field name.
+ dc_only - if given, extract only the listed Dublin Core fields.
"""
# validator, name
for field in dcparser.BookInfo.FIELDS:
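+ # when dc_only is given, skip fields that are not listed in it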
+ if dc_only and field.name not in dc_only:
+ continue
if hasattr(book_info, field.name):
if not getattr(book_info, field.name):
continue
.setIntValue('header_span' in fields and fields['header_span'] or 1))
doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
- print ">>[%s]>%s<<<" % (fields.get('fragment_anchor', ''), fields['content'])
-
doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
Field.TermVector.WITH_POSITIONS_OFFSETS))
try:
f(*a)
except Exception, e:
- print("Error in indexing thread: %s" % e)
+ log.error("Error in indexing thread: %s" % e)
traceback.print_exc()
- raise e
+ raise
return _wrap
if ReusableIndex.index:
self.index = ReusableIndex.index
else:
- print("opening index")
Index.open(self, analyzer, **kw)
ReusableIndex.index = self.index
atexit.register(ReusableIndex.close_reusable)
@staticmethod
def close_reusable():
if ReusableIndex.index:
- print("closing index")
ReusableIndex.index.optimize()
ReusableIndex.index.close()
ReusableIndex.index = None
# remove fragments with duplicated fid's and duplicated snippets
frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
- frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or hash(f),
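+ # a fragment with no snippets falls back to its fragment key for deduplication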
+ frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
lambda a, b: cmp(a[SCORE], b[SCORE]))
# remove duplicate sections
for r in rl:
if r.book_id in books:
books[r.book_id].merge(r)
- #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
else:
books[r.book_id] = r
return books.values()
def reopen(self, **unused):
reader = self.searcher.getIndexReader()
rdr = reader.reopen()
- print "got signal to reopen index"
if not rdr.equals(reader):
- print "will reopen index"
+ log.debug('Reopening index')
oldsearch = self.searcher
self.searcher = IndexSearcher(rdr)
oldsearch.close()
return toks
- def fuzziness(self, fuzzy):
+ @staticmethod
+ def fuzziness(fuzzy):
"""Helper method to sanitize fuzziness"""
if not fuzzy:
return None
fuzzterms = []
while True:
- # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
ft = fuzzterm.term()
if ft:
fuzzterms.append(ft)
phrase.add(term)
return phrase
- def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
+ @staticmethod
+ def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
"""
Returns term queries joined by boolean query.
modal - applies to boolean query
topDocs = self.searcher.search(q, only_in, max_results)
for found in topDocs.scoreDocs:
books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
- print "* %s theme x content: %s" % (searched, books[-1]._hits)
# query themes/content x author/title/tags
q = BooleanQuery()
topDocs = self.searcher.search(q, only_in, max_results)
for found in topDocs.scoreDocs:
books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
- print "* %s scatter search: %s" % (searched, books[-1]._hits)
return books
return None
revision = stored.get('snippets_revision')
if revision: revision = int(revision)
+
# locate content.
book_id = int(stored.get('book_id'))
- snippets = Snippets(book_id, revision=revision).open()
+ snippets = Snippets(book_id, revision=revision)
+
+ try:
+ snippets.open()
+ except IOError, e:
+ log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
+ return []
+
try:
try:
text = snippets.get((int(position),
if terms:
return JArray('object')(terms, Term)
- def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
+ def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
"""
Search for Tag objects using query.
"""
if not pdcounter:
- filters = self.chain_filters([filter, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
- tops = self.searcher.search(query, filters, max_results)
+ filt = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
+ tops = self.searcher.search(query, filt, max_results)
tags = []
for found in tops.scoreDocs:
else:
tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
- # don't add the pdcounter tag if same tag already exists
- if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
- tags.append(tag)
+ tags.append(tag)
except catalogue.models.Tag.DoesNotExist: pass
except PDCounterAuthor.DoesNotExist: pass
except PDCounterBook.DoesNotExist: pass
- # print "%s (%d) -> %f" % (tag, tag.id, found.score)
- print 'returning %s' % tags
+ log.debug('search_tags: %s' % tags)
+
return tags
- def search_books(self, query, filter=None, max_results=10):
+ def search_books(self, query, filt=None, max_results=10):
"""
Searches for Book objects using query
"""
bks = []
- tops = self.searcher.search(query, filter, max_results)
+ tops = self.searcher.search(query, filt, max_results)
for found in tops.scoreDocs:
doc = self.searcher.doc(found.doc)
try: