Merge branch 'lucene_memory'

author Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>

Wed, 11 Apr 2012 09:24:59 +0000 (11:24 +0200)

committer Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>

Wed, 11 Apr 2012 09:24:59 +0000 (11:24 +0200)
author Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
Wed, 11 Apr 2012 09:24:59 +0000 (11:24 +0200)
committer Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
Wed, 11 Apr 2012 09:24:59 +0000 (11:24 +0200)
diff --git a/apps/search/index.py b/apps/search/index.py

index 4e71e25..b689c76 100644 (file)
--- a/apps/search/index.py
+++ b/apps/search/index.py
@@ -317,14 +317,19 @@ class Index(BaseIndex):
              doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
          return doc
  
-    def remove_book(self, book, remove_snippets=True):
+    def remove_book(self, book_or_id, remove_snippets=True):
          """Removes a book from search index.
          book - Book instance."""
-        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
+        if isinstance(book_or_id, catalogue.models.Book):
+            book_id = book_or_id.id
+        else:
+            book_id = book_or_id
+
+        q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
          self.index.deleteDocuments(q)
  
          if remove_snippets:
-            snippets = Snippets(book.id)
+            snippets = Snippets(book_id)
              snippets.remove()
  
      def index_book(self, book, book_info=None, overwrite=True):
@@ -339,7 +344,11 @@ class Index(BaseIndex):
              self.remove_book(book, remove_snippets=False)
  
          book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info)
+        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+        # let's not index it - it's only used for extracting publish date
+        if 'source_name' in meta_fields:
+            del meta_fields['source_name']
+        
          for f in meta_fields.values():
              if isinstance(f, list) or isinstance(f, tuple):
                  for elem in f:
@@ -373,7 +382,7 @@ class Index(BaseIndex):
  
      published_date_re = re.compile("([0-9]+)[\]. ]*$")
  
-    def extract_metadata(self, book, book_info=None):
+    def extract_metadata(self, book, book_info=None, dc_only=None):
          """
          Extract metadata from book and returns a map of fields keyed by fieldname
          """
@@ -388,6 +397,8 @@ class Index(BaseIndex):
  
          # validator, name
          for field in dcparser.BookInfo.FIELDS:
+            if dc_only and field.name not in dc_only:
+                continue
              if hasattr(book_info, field.name):
                  if not getattr(book_info, field.name):
                      continue
@@ -1055,7 +1066,8 @@ class Search(IndexStore):
  
          return toks
  
-    def fuzziness(self, fuzzy):
+    @staticmethod
+    def fuzziness(fuzzy):
          """Helper method to sanitize fuzziness"""
          if not fuzzy:
              return None
@@ -1092,7 +1104,8 @@ class Search(IndexStore):
                  phrase.add(term)
          return phrase
  
-    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
+    @staticmethod
+    def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
          """
          Returns term queries joined by boolean query.
          modal - applies to boolean query
diff --git a/apps/search/management/commands/optimizeindex.py b/apps/search/management/commands/optimizeindex.py

index a8a4cf9..51bf95b 100644 (file)
--- a/apps/search/management/commands/optimizeindex.py
+++ b/apps/search/management/commands/optimizeindex.py
@@ -1,14 +1,38 @@
  
  from django.core.management.base import BaseCommand
-from search import Index
+from search import Index, Search
+from lucene import IndexReader, IndexSearcher, Term
+from catalogue.models import Book
+
  
  class Command(BaseCommand):
      help = 'Optimize Lucene search index'
      args = ''
  
+    def delete_old(self, index):
+        existing_ids = set([book.id for book in Book.objects.all()])
+
+        reader = IndexReader.open(index.index, False)
+        searcher = IndexSearcher(reader)
+        try:
+            num = searcher.docFreq(Term('is_book', 'true'))
+            docs = searcher.search(Search.make_term_query(['true'], 'is_book'), num)
+            for result in docs.scoreDocs:
+                stored = searcher.doc(result.doc)
+                book_id = int(stored.get('book_id'))
+                if not book_id in existing_ids:
+                    print "book id %d doesn't exist." % book_id
+                    index.remove_book(book_id)
+        finally:
+            searcher.close()
+            reader.close()
+
      def handle(self, *args, **opts):
          index = Index()
          index.open()
+
+        self.delete_old(index)
+
          try:
              index.optimize()
          finally:
diff --git a/apps/search/views.py b/apps/search/views.py

index 09f217f..fd5883e 100644 (file)
--- a/apps/search/views.py
+++ b/apps/search/views.py
@@ -14,6 +14,7 @@ from catalogue.views import JSONResponse
  from search import Search, JVM, SearchResult
  from lucene import StringReader
  from suggest.forms import PublishingSuggestForm
+from time import sleep
  import re
  import enchant
  
@@ -50,8 +51,21 @@ def did_you_mean(query, tokens):
  
      return query
  
+
  JVM.attachCurrentThread()
-search = Search()
+_search = None
+
+
+def get_search():
+    global _search
+
+    while _search is False:
+        sleep(1)
+
+    if _search is None:
+        _search = False
+        _search = Search()
+    return _search
  
  
  def hint(request):
@@ -60,6 +74,7 @@ def hint(request):
          return JSONResponse([])
      JVM.attachCurrentThread()
  
+    search = get_search()
      hint = search.hint()
      try:
          tags = request.GET.get('tags', '')
@@ -117,6 +132,7 @@ def main(request):
          return render_to_response('catalogue/search_too_short.html', {'prefix': query},
                                    context_instance=RequestContext(request))
  
+    search = get_search()
      # hint.tags(tag_list)
      # if book:
      #     hint.books(book)
diff --git a/wolnelektury/settings/__init__.py b/wolnelektury/settings/__init__.py

index f2314a9..15126b9 100644 (file)
--- a/wolnelektury/settings/__init__.py
+++ b/wolnelektury/settings/__init__.py
@@ -63,6 +63,7 @@ INSTALLED_APPS_OUR = [
      'picture',
      'social',
      'waiter',
+    'search',
      ]
  
  INSTALLED_APPS_CONTRIB = [
author	Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
	Wed, 11 Apr 2012 09:24:59 +0000 (11:24 +0200)
committer	Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
	Wed, 11 Apr 2012 09:24:59 +0000 (11:24 +0200)
apps/search/index.py		patch | blob | history
apps/search/management/commands/optimizeindex.py		patch | blob | history
apps/search/views.py		patch | blob | history
wolnelektury/settings/__init__.py		patch | blob | history