X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/38f324dec64ba8adffcc795095023557b8a7a39c..ad9d29909733d402b8c197cb7c5d46afe887fa15:/apps/search/index.py diff --git a/apps/search/index.py b/apps/search/index.py index 8ea31240e..b3e932285 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from django.conf import settings -from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \ +from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \ File, Field, Integer, \ NumericField, Version, Document, JavaError, IndexSearcher, \ QueryParser, PerFieldAnalyzerWrapper, \ @@ -27,7 +27,7 @@ from librarian import dcparser from librarian.parser import WLDocument from lxml import etree import catalogue.models -from pdcounter.models import Author as PDCounterAuthor +from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook from multiprocessing.pool import ThreadPool from threading import current_thread import atexit @@ -82,7 +82,7 @@ class IndexStore(object): """ def __init__(self): self.make_index_dir() - self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX)) + self.store = NIOFSDirectory(File(settings.SEARCH_INDEX)) def make_index_dir(self): try: @@ -169,11 +169,13 @@ class BaseIndex(IndexStore): analyzer = WLAnalyzer() self.analyzer = analyzer - def open(self, analyzer=None): + def open(self, timeout=None): if self.index: raise Exception("Index is already opened") - self.index = IndexWriter(self.store, self.analyzer,\ - IndexWriter.MaxFieldLength.LIMITED) + conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer) + if timeout: + conf.setWriteLockTimeout(long(timeout)) + self.index = IndexWriter(self.store, conf) return self.index def optimize(self): @@ -212,7 +214,7 @@ class Index(BaseIndex): q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True) self.index.deleteDocuments(q) - for tag in catalogue.models.Tag.objects.all(): + for tag in catalogue.models.Tag.objects.exclude(category='set'): doc = Document() doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id))) doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED)) @@ -225,7 +227,16 @@ class Index(BaseIndex): doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id))) doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED)) doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED)) + doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED)) + doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED)) + self.index.addDocument(doc) + + for pdtag in PDCounterBook.objects.all(): + doc = Document() + doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id))) + doc.add(Field("tag_name", pdtag.title, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_name_pl", pdtag.title, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED)) self.index.addDocument(doc) @@ -262,7 +273,6 @@ class Index(BaseIndex): book_doc.add(elem) else: book_doc.add(f) - self.index.addDocument(book_doc) del book_doc @@ -553,12 +563,12 @@ class ReusableIndex(Index): """ index = None - def open(self, analyzer=None, threads=4): + def open(self, analyzer=None, **kw): if ReusableIndex.index: self.index = ReusableIndex.index else: print("opening index") - Index.open(self, analyzer) + Index.open(self, analyzer, **kw) ReusableIndex.index = self.index atexit.register(ReusableIndex.close_reusable) @@ -640,9 +650,10 @@ class SearchResult(object): self.book_id = int(stored.get("book_id")) pd = stored.get("published_date") - if pd is None: - pd = 0 - self.published_date = int(pd) + try: + self.published_date = int(pd) + except ValueError: + self.published_date = 0 header_type = stored.get("header_type") # we have a content hit in some header of fragment @@ -732,7 +743,7 @@ class SearchResult(object): for f in frags: try: - frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT]) + frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id) except catalogue.models.Fragment.DoesNotExist: # stale index continue @@ -1262,8 +1273,15 @@ class Search(IndexStore): for found in tops.scoreDocs: doc = self.searcher.doc(found.doc) is_pdcounter = doc.get('is_pdcounter') - if is_pdcounter: - tag = PDCounterAuthor.objects.get(id=doc.get('tag_id')) + category = doc.get('tag_category') + if is_pdcounter == 'true': + if category == 'pd_author': + tag = PDCounterAuthor.objects.get(id=doc.get('tag_id')) + elif category == 'pd_book': + tag = PDCounterBook.objects.get(id=doc.get('tag_id')) + tag.category = 'pd_book' # make it look more lik a tag. + else: + print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category) else: tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id")) # don't add the pdcounter tag if same tag already exists