X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/dbb6eb3883a5f5e371f4bf7c89e74326feca0fd1..f729c572910e51749bb2464761038b6eceaf79c6:/apps/search/index.py diff --git a/apps/search/index.py b/apps/search/index.py index 9d6d59861..a0bf71588 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -18,7 +18,7 @@ from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, # KeywordAnalyzer # Initialize jvm -JVM = initVM(CLASSPATH) +JVM = initVM(CLASSPATH, maxheap=settings.JVM_MAXHEAP) import sys import os @@ -31,9 +31,11 @@ import catalogue.models from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook from multiprocessing.pool import ThreadPool from threading import current_thread +from itertools import chain import atexit import traceback - +import logging +log = logging.getLogger('search') class WLAnalyzer(PerFieldAnalyzerWrapper): def __init__(self): @@ -147,7 +149,6 @@ class Snippets(object): if not os.path.exists(self.path): break self.revision += 1 - print "using %s" % self.path self.file = open(self.path, mode) self.position = 0 @@ -218,7 +219,7 @@ class BaseIndex(IndexStore): try: self.index.optimize() except JavaError, je: - print "Error during optimize phase, check index: %s" % je + log.error("Error during optimize phase, check index: %s" % je) self.index.close() self.index = None @@ -277,9 +278,9 @@ class Index(BaseIndex): if not remove_only: # then add them [all or just one passed] if not tags: - tags = catalogue.models.Tag.objects.exclude(category='set') + \ - PDCounterAuthor.objects.all() + \ - PDCounterBook.objects.all() + tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \ + PDCounterAuthor.objects.all(), \ + PDCounterBook.objects.all()) for tag in tags: if isinstance(tag, PDCounterAuthor): @@ -316,14 +317,19 @@ class Index(BaseIndex): doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id))) return doc - def remove_book(self, book, remove_snippets=True): + def remove_book(self, book_or_id, remove_snippets=True): """Removes a book from search index. book - Book instance.""" - q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True) + if isinstance(book_or_id, catalogue.models.Book): + book_id = book_or_id.id + else: + book_id = book_or_id + + q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True) self.index.deleteDocuments(q) if remove_snippets: - snippets = Snippets(book.id) + snippets = Snippets(book_id) snippets.remove() def index_book(self, book, book_info=None, overwrite=True): @@ -338,7 +344,11 @@ class Index(BaseIndex): self.remove_book(book, remove_snippets=False) book_doc = self.create_book_doc(book) - meta_fields = self.extract_metadata(book, book_info) + meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title']) + # let's not index it - it's only used for extracting publish date + if 'source_name' in meta_fields: + del meta_fields['source_name'] + for f in meta_fields.values(): if isinstance(f, list) or isinstance(f, tuple): for elem in f: @@ -372,7 +382,7 @@ class Index(BaseIndex): published_date_re = re.compile("([0-9]+)[\]. ]*$") - def extract_metadata(self, book, book_info=None): + def extract_metadata(self, book, book_info=None, dc_only=None): """ Extract metadata from book and returns a map of fields keyed by fieldname """ @@ -387,6 +397,8 @@ class Index(BaseIndex): # validator, name for field in dcparser.BookInfo.FIELDS: + if dc_only and field.name not in dc_only: + continue if hasattr(book_info, field.name): if not getattr(book_info, field.name): continue @@ -492,8 +504,6 @@ class Index(BaseIndex): .setIntValue('header_span' in fields and fields['header_span'] or 1)) doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED)) - print ">>[%s]>%s<<<" % (fields.get('fragment_anchor', ''), fields['content']) - doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \ Field.TermVector.WITH_POSITIONS_OFFSETS)) @@ -623,7 +633,7 @@ def log_exception_wrapper(f): try: f(*a) except Exception, e: - print("Error in indexing thread: %s" % e) + log.error("Error in indexing thread: %s" % e) traceback.print_exc() raise e return _wrap @@ -643,7 +653,6 @@ class ReusableIndex(Index): if ReusableIndex.index: self.index = ReusableIndex.index else: - print("opening index") Index.open(self, analyzer, **kw) ReusableIndex.index = self.index atexit.register(ReusableIndex.close_reusable) @@ -655,7 +664,6 @@ class ReusableIndex(Index): @staticmethod def close_reusable(): if ReusableIndex.index: - print("closing index") ReusableIndex.index.optimize() ReusableIndex.index.close() ReusableIndex.index = None @@ -808,7 +816,7 @@ class SearchResult(object): # remove fragments with duplicated fid's and duplicated snippets frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE])) - frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or hash(f), + frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE])) # remove duplicate sections @@ -874,7 +882,6 @@ class SearchResult(object): for r in rl: if r.book_id in books: books[r.book_id].merge(r) - #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score)) else: books[r.book_id] = r return books.values() @@ -1010,9 +1017,8 @@ class Search(IndexStore): def reopen(self, **unused): reader = self.searcher.getIndexReader() rdr = reader.reopen() - print "got signal to reopen index" if not rdr.equals(reader): - print "will reopen index" + log.debug('Reopening index') oldsearch = self.searcher self.searcher = IndexSearcher(rdr) oldsearch.close() @@ -1060,7 +1066,8 @@ class Search(IndexStore): return toks - def fuzziness(self, fuzzy): + @staticmethod + def fuzziness(fuzzy): """Helper method to sanitize fuzziness""" if not fuzzy: return None @@ -1081,7 +1088,6 @@ class Search(IndexStore): fuzzterms = [] while True: - # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8')) ft = fuzzterm.term() if ft: fuzzterms.append(ft) @@ -1098,7 +1104,8 @@ class Search(IndexStore): phrase.add(term) return phrase - def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False): + @staticmethod + def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False): """ Returns term queries joined by boolean query. modal - applies to boolean query @@ -1252,7 +1259,6 @@ class Search(IndexStore): topDocs = self.searcher.search(q, only_in, max_results) for found in topDocs.scoreDocs: books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched)) - print "* %s theme x content: %s" % (searched, books[-1]._hits) # query themes/content x author/title/tags q = BooleanQuery() @@ -1271,7 +1277,6 @@ class Search(IndexStore): topDocs = self.searcher.search(q, only_in, max_results) for found in topDocs.scoreDocs: books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched)) - print "* %s scatter search: %s" % (searched, books[-1]._hits) return books @@ -1332,9 +1337,17 @@ class Search(IndexStore): return None revision = stored.get('snippets_revision') if revision: revision = int(revision) + # locate content. book_id = int(stored.get('book_id')) - snippets = Snippets(book_id, revision=revision).open() + snippets = Snippets(book_id, revision=revision) + + try: + snippets.open() + except IOError, e: + log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e)) + return [] + try: try: text = snippets.get((int(position), @@ -1371,13 +1384,13 @@ class Search(IndexStore): if terms: return JArray('object')(terms, Term) - def search_tags(self, query, filters=None, max_results=40, pdcounter=False): + def search_tags(self, query, filt=None, max_results=40, pdcounter=False): """ Search for Tag objects using query. """ if not pdcounter: - filters = self.chain_filters([filter, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)]) - tops = self.searcher.search(query, filters, max_results) + filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)]) + tops = self.searcher.search(query, filt, max_results) tags = [] for found in tops.scoreDocs: @@ -1396,22 +1409,23 @@ class Search(IndexStore): else: tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id")) # don't add the pdcounter tag if same tag already exists - if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)): - tags.append(tag) + + tags.append(tag) + except catalogue.models.Tag.DoesNotExist: pass except PDCounterAuthor.DoesNotExist: pass except PDCounterBook.DoesNotExist: pass - # print "%s (%d) -> %f" % (tag, tag.id, found.score) - print 'returning %s' % tags + log.debug('search_tags: %s' % tags) + return tags - def search_books(self, query, filter=None, max_results=10): + def search_books(self, query, filt=None, max_results=10): """ Searches for Book objects using query """ bks = [] - tops = self.searcher.search(query, filter, max_results) + tops = self.searcher.search(query, filt, max_results) for found in tops.scoreDocs: doc = self.searcher.doc(found.doc) try: