optimizeindex removes documents for deleted books
author Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
Thu, 5 Apr 2012 08:33:03 +0000 (10:33 +0200)
committer Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
Thu, 5 Apr 2012 08:33:03 +0000 (10:33 +0200)
apps/search/index.py
apps/search/management/commands/optimizeindex.py

index e8b7a5c..6883978 100644 (file)
@@ -317,14 +317,19 @@ class Index(BaseIndex):
             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
         return doc
 
-    def remove_book(self, book, remove_snippets=True):
+    def remove_book(self, book_or_id, remove_snippets=True):
         """Removes a book from search index.
-        book - Book instance."""
-        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
+        book_or_id - Book instance, or the id of a book (anything that is
+        not a Book instance is used as the id as-is).
+        remove_snippets - when true, Snippets(book_id).remove() is called too."""
+        # Accepting a bare id lets callers remove the documents of a book
+        # for which they have no Book instance to pass in.
+        if isinstance(book_or_id, catalogue.models.Book):
+            book_id = book_or_id.id
+        else:
+            book_id = book_or_id
+
+        # Inclusive range [book_id, book_id] on the numeric "book_id" field.
+        q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
         self.index.deleteDocuments(q)
 
         if remove_snippets:
-            snippets = Snippets(book.id)
+            snippets = Snippets(book_id)
             snippets.remove()
 
     def index_book(self, book, book_info=None, overwrite=True):
@@ -1060,6 +1065,7 @@ class Search(IndexStore):
 
         return toks
 
+    @staticmethod
     def fuzziness(self, fuzzy):
         """Helper method to sanitize fuzziness"""
         if not fuzzy:
@@ -1097,6 +1103,7 @@ class Search(IndexStore):
                 phrase.add(term)
         return phrase
 
+    @staticmethod
     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
         """
         Returns term queries joined by boolean query.
index a8a4cf9..51bf95b 100644 (file)
@@ -1,14 +1,38 @@
 
 from django.core.management.base import BaseCommand
-from search import Index
+from search import Index, Search
+from lucene import IndexReader, IndexSearcher, Term
+from catalogue.models import Book
+
 
 class Command(BaseCommand):
     help = 'Optimize Lucene search index'
     args = ''
 
+    def delete_old(self, index):
+        """Remove index documents of books that are no longer in the database.
+
+        index - an opened search Index. Documents are looked up through a
+        reader opened on index.index and removed with index.remove_book().
+
+        Every stored document whose 'is_book' field is 'true' is inspected;
+        when its 'book_id' is not the id of any existing Book, the book is
+        reported on stdout and removed from the index.
+        """
+        # Imported here so the method does not depend on the module header.
+        from lucene import TermQuery
+
+        # Only the ids are needed, so do not instantiate whole Book objects.
+        existing_ids = set(Book.objects.values_list('id', flat=True))
+
+        is_book = Term('is_book', 'true')
+        reader = IndexReader.open(index.index, False)
+        searcher = IndexSearcher(reader)
+        try:
+            num = searcher.docFreq(is_book)
+            if num == 0:
+                # No book documents at all, so there is nothing to check.
+                # NOTE(review): a search limited to zero hits is presumably
+                # rejected by Lucene as well - confirm for the version in use.
+                return
+            # Build the single-term query directly instead of going through
+            # Search.make_term_query: that helper takes `self` as its first
+            # parameter, so a call on the class would bind ['true'] to
+            # `self` and 'is_book' to `tokens`.
+            docs = searcher.search(TermQuery(is_book), num)
+            for result in docs.scoreDocs:
+                stored = searcher.doc(result.doc)
+                book_id = int(stored.get('book_id'))
+                if book_id not in existing_ids:
+                    print "book id %d doesn't exist." % book_id
+                    index.remove_book(book_id)
+        finally:
+            searcher.close()
+            reader.close()
+
     def handle(self, *args, **opts):
         index = Index()
         index.open()
+
+        self.delete_old(index)
+
         try:
             index.optimize()
         finally: