X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/2b110a246aad2a6a8c724abfb9a54ed53c466c39..86b624d26dbc858b0252200567b0db169df235c2:/apps/search/index.py?ds=sidebyside diff --git a/apps/search/index.py b/apps/search/index.py index 97145d340..b3e932285 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from django.conf import settings -from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \ +from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \ File, Field, Integer, \ NumericField, Version, Document, JavaError, IndexSearcher, \ QueryParser, PerFieldAnalyzerWrapper, \ @@ -27,7 +27,7 @@ from librarian import dcparser from librarian.parser import WLDocument from lxml import etree import catalogue.models -from pdcounter.models import Author as PDCounterAuthor +from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook from multiprocessing.pool import ThreadPool from threading import current_thread import atexit @@ -82,7 +82,7 @@ class IndexStore(object): """ def __init__(self): self.make_index_dir() - self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX)) + self.store = NIOFSDirectory(File(settings.SEARCH_INDEX)) def make_index_dir(self): try: @@ -169,11 +169,13 @@ class BaseIndex(IndexStore): analyzer = WLAnalyzer() self.analyzer = analyzer - def open(self, analyzer=None): + def open(self, timeout=None): if self.index: raise Exception("Index is already opened") - self.index = IndexWriter(self.store, self.analyzer,\ - IndexWriter.MaxFieldLength.LIMITED) + conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer) + if timeout: + conf.setWriteLockTimeout(long(timeout)) + self.index = IndexWriter(self.store, conf) return self.index def optimize(self): @@ -212,7 +214,7 @@ class Index(BaseIndex): q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True) self.index.deleteDocuments(q) - for tag in 
catalogue.models.Tag.objects.all(): + for tag in catalogue.models.Tag.objects.exclude(category='set'): doc = Document() doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id))) doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED)) @@ -225,7 +227,16 @@ class Index(BaseIndex): doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id))) doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED)) doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED)) + doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED)) + doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED)) + self.index.addDocument(doc) + + for pdtag in PDCounterBook.objects.all(): + doc = Document() + doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id))) + doc.add(Field("tag_name", pdtag.title, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_name_pl", pdtag.title, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED)) self.index.addDocument(doc) @@ -262,7 +273,6 @@ class Index(BaseIndex): book_doc.add(elem) else: book_doc.add(f) - self.index.addDocument(book_doc) del book_doc @@ -553,12 +563,12 @@ class ReusableIndex(Index): """ index = None - def open(self, analyzer=None, threads=4): + def open(self, analyzer=None, **kw): if ReusableIndex.index: self.index = ReusableIndex.index else: print("opening index") - Index.open(self, analyzer) + Index.open(self, analyzer, **kw) ReusableIndex.index = self.index atexit.register(ReusableIndex.close_reusable) @@ -640,9 +650,10 @@ class SearchResult(object): self.book_id = int(stored.get("book_id")) pd = stored.get("published_date") - 
if pd is None: - pd = 0 - self.published_date = int(pd) + try: + self.published_date = int(pd) + except ValueError: + self.published_date = 0 header_type = stored.get("header_type") # we have a content hit in some header of fragment @@ -732,7 +743,7 @@ class SearchResult(object): for f in frags: try: - frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT]) + frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id) except catalogue.models.Fragment.DoesNotExist: # stale index continue @@ -1212,17 +1223,25 @@ class Search(IndexStore): if position is None or length is None: return None # locate content. - snippets = Snippets(stored.get('book_id')).open() + book_id = int(stored.get('book_id')) + snippets = Snippets(book_id).open() try: - text = snippets.get((int(position), - int(length))) - finally: - snippets.close() + try: + text = snippets.get((int(position), + int(length))) + finally: + snippets.close() - tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer) - # highlighter.getBestTextFragments(tokenStream, text, False, 10) - snip = highlighter.getBestFragments(tokenStream, text, 3, "...") + tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer) + # highlighter.getBestTextFragments(tokenStream, text, False, 10) + snip = highlighter.getBestFragments(tokenStream, text, 3, "...") + except Exception, e: + e2 = e + if hasattr(e, 'getJavaException'): + e2 = unicode(e.getJavaException()) + raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)), + e2) return snip @staticmethod @@ -1254,8 +1273,15 @@ class Search(IndexStore): for found in tops.scoreDocs: doc = self.searcher.doc(found.doc) is_pdcounter = doc.get('is_pdcounter') - if is_pdcounter: - tag = PDCounterAuthor.objects.get(id=doc.get('tag_id')) + category = doc.get('tag_category') + if is_pdcounter == 'true': 
+ if category == 'pd_author': + tag = PDCounterAuthor.objects.get(id=doc.get('tag_id')) + elif category == 'pd_book': + tag = PDCounterBook.objects.get(id=doc.get('tag_id')) + tag.category = 'pd_book' # make it look more like a tag. + else: + print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category) else: tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id")) # don't add the pdcounter tag if same tag already exists @@ -1302,7 +1328,7 @@ class Search(IndexStore): return only_term - def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True): + def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False): """ Return auto-complete hints for tags using prefix search. @@ -1314,14 +1340,14 @@ class Search(IndexStore): if prefix: q = self.make_prefix_phrase(toks, field) else: - q = self.make_term_query(toks, field) + q = self.make_term_query(toks, field, fuzzy=fuzzy) top.add(BooleanClause(q, BooleanClause.Occur.SHOULD)) no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True) return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter) - def hint_books(self, string, max_results=50, prefix=True): + def hint_books(self, string, max_results=50, prefix=True, fuzzy=False): """ Returns auto-complete hints for book titles Because we do not index 'pseudo' title-tags. @@ -1332,7 +1358,7 @@ class Search(IndexStore): if prefix: q = self.make_prefix_phrase(toks, 'title') else: - q = self.make_term_query(toks, 'title') + q = self.make_term_query(toks, 'title', fuzzy=fuzzy) return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)