From: Marcin Koziej Date: Thu, 5 Apr 2012 08:33:03 +0000 (+0200) Subject: optimizeindex removes documents for deleted books X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/f2e5eb8d76c57ee0198207e68a198d2b9cfd6fec?ds=inline;hp=-c optimizeindex removes documents for deleted books --- f2e5eb8d76c57ee0198207e68a198d2b9cfd6fec diff --git a/apps/search/index.py b/apps/search/index.py index e8b7a5ccc..6883978af 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -317,14 +317,19 @@ class Index(BaseIndex): doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id))) return doc - def remove_book(self, book, remove_snippets=True): + def remove_book(self, book_or_id, remove_snippets=True): """Removes a book from search index. book - Book instance.""" - q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True) + if isinstance(book_or_id, catalogue.models.Book): + book_id = book_or_id.id + else: + book_id = book_or_id + + q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True) self.index.deleteDocuments(q) if remove_snippets: - snippets = Snippets(book.id) + snippets = Snippets(book_id) snippets.remove() def index_book(self, book, book_info=None, overwrite=True): @@ -1060,6 +1065,7 @@ class Search(IndexStore): return toks + @staticmethod def fuzziness(self, fuzzy): """Helper method to sanitize fuzziness""" if not fuzzy: @@ -1097,6 +1103,7 @@ class Search(IndexStore): phrase.add(term) return phrase + @staticmethod def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False): """ Returns term queries joined by boolean query. diff --git a/apps/search/management/commands/optimizeindex.py b/apps/search/management/commands/optimizeindex.py index a8a4cf9dd..51bf95b4e 100644 --- a/apps/search/management/commands/optimizeindex.py +++ b/apps/search/management/commands/optimizeindex.py @@ -1,14 +1,38 @@ from django.core.management.base import BaseCommand -from search import Index +from search import Index, Search +from lucene import IndexReader, IndexSearcher, Term +from catalogue.models import Book + class Command(BaseCommand): help = 'Optimize Lucene search index' args = '' + def delete_old(self, index): + existing_ids = set([book.id for book in Book.objects.all()]) + + reader = IndexReader.open(index.index, False) + searcher = IndexSearcher(reader) + try: + num = searcher.docFreq(Term('is_book', 'true')) + docs = searcher.search(Search.make_term_query(['true'], 'is_book'), num) + for result in docs.scoreDocs: + stored = searcher.doc(result.doc) + book_id = int(stored.get('book_id')) + if not book_id in existing_ids: + print "book id %d doesn't exist." % book_id + index.remove_book(book_id) + finally: + searcher.close() + reader.close() + def handle(self, *args, **opts): index = Index() index.open() + + self.delete_old(index) + try: index.optimize() finally: