bump librarian to master

[wolnelektury.git] / apps / search / index.py
diff --git a/apps/search/index.py b/apps/search/index.py

index 9d6d598..a0bf715 100644 (file)
--- a/apps/search/index.py
+++ b/apps/search/index.py
@@ -18,7 +18,7 @@ from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader,
      # KeywordAnalyzer
  
  # Initialize jvm
      # KeywordAnalyzer
  
  # Initialize jvm
-JVM = initVM(CLASSPATH)
+JVM = initVM(CLASSPATH, maxheap=settings.JVM_MAXHEAP)
  
  import sys
  import os
  
  import sys
  import os
@@ -31,9 +31,11 @@ import catalogue.models
  from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  from multiprocessing.pool import ThreadPool
  from threading import current_thread
  from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  from multiprocessing.pool import ThreadPool
  from threading import current_thread
+from itertools import chain
  import atexit
  import traceback
  import atexit
  import traceback
-
+import logging
+log = logging.getLogger('search')
  
  class WLAnalyzer(PerFieldAnalyzerWrapper):
      def __init__(self):
  
  class WLAnalyzer(PerFieldAnalyzerWrapper):
      def __init__(self):
@@ -147,7 +149,6 @@ class Snippets(object):
                      if not os.path.exists(self.path):
                          break
                      self.revision += 1
                      if not os.path.exists(self.path):
                          break
                      self.revision += 1
-            print "using %s" % self.path
  
          self.file = open(self.path, mode)
          self.position = 0
  
          self.file = open(self.path, mode)
          self.position = 0
@@ -218,7 +219,7 @@ class BaseIndex(IndexStore):
          try:
              self.index.optimize()
          except JavaError, je:
          try:
              self.index.optimize()
          except JavaError, je:
-            print "Error during optimize phase, check index: %s" % je
+            log.error("Error during optimize phase, check index: %s" % je)
  
          self.index.close()
          self.index = None
  
          self.index.close()
          self.index = None
@@ -277,9 +278,9 @@ class Index(BaseIndex):
          if not remove_only:
              # then add them [all or just one passed]
              if not tags:
          if not remove_only:
              # then add them [all or just one passed]
              if not tags:
-                tags = catalogue.models.Tag.objects.exclude(category='set') + \
-                    PDCounterAuthor.objects.all() + \
-                    PDCounterBook.objects.all()
+                tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
+                    PDCounterAuthor.objects.all(), \
+                    PDCounterBook.objects.all())
  
              for tag in tags:
                  if isinstance(tag, PDCounterAuthor):
  
              for tag in tags:
                  if isinstance(tag, PDCounterAuthor):
@@ -316,14 +317,19 @@ class Index(BaseIndex):
              doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
          return doc
  
              doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
          return doc
  
-    def remove_book(self, book, remove_snippets=True):
+    def remove_book(self, book_or_id, remove_snippets=True):
          """Removes a book from search index.
          book - Book instance."""
          """Removes a book from search index.
          book - Book instance."""
-        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
+        if isinstance(book_or_id, catalogue.models.Book):
+            book_id = book_or_id.id
+        else:
+            book_id = book_or_id
+
+        q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
          self.index.deleteDocuments(q)
  
          if remove_snippets:
          self.index.deleteDocuments(q)
  
          if remove_snippets:
-            snippets = Snippets(book.id)
+            snippets = Snippets(book_id)
              snippets.remove()
  
      def index_book(self, book, book_info=None, overwrite=True):
              snippets.remove()
  
      def index_book(self, book, book_info=None, overwrite=True):
@@ -338,7 +344,11 @@ class Index(BaseIndex):
              self.remove_book(book, remove_snippets=False)
  
          book_doc = self.create_book_doc(book)
              self.remove_book(book, remove_snippets=False)
  
          book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info)
+        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+        # let's not index it - it's only used for extracting publish date
+        if 'source_name' in meta_fields:
+            del meta_fields['source_name']
+        
          for f in meta_fields.values():
              if isinstance(f, list) or isinstance(f, tuple):
                  for elem in f:
          for f in meta_fields.values():
              if isinstance(f, list) or isinstance(f, tuple):
                  for elem in f:
@@ -372,7 +382,7 @@ class Index(BaseIndex):
  
      published_date_re = re.compile("([0-9]+)[\]. ]*$")
  
  
      published_date_re = re.compile("([0-9]+)[\]. ]*$")
  
-    def extract_metadata(self, book, book_info=None):
+    def extract_metadata(self, book, book_info=None, dc_only=None):
          """
          Extract metadata from book and returns a map of fields keyed by fieldname
          """
          """
          Extract metadata from book and returns a map of fields keyed by fieldname
          """
@@ -387,6 +397,8 @@ class Index(BaseIndex):
  
          # validator, name
          for field in dcparser.BookInfo.FIELDS:
  
          # validator, name
          for field in dcparser.BookInfo.FIELDS:
+            if dc_only and field.name not in dc_only:
+                continue
              if hasattr(book_info, field.name):
                  if not getattr(book_info, field.name):
                      continue
              if hasattr(book_info, field.name):
                  if not getattr(book_info, field.name):
                      continue
@@ -492,8 +504,6 @@ class Index(BaseIndex):
                      .setIntValue('header_span' in fields and fields['header_span'] or 1))
              doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
  
                      .setIntValue('header_span' in fields and fields['header_span'] or 1))
              doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
  
-            print ">>[%s]>%s<<<" % (fields.get('fragment_anchor', ''), fields['content'])
-
              doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
                            Field.TermVector.WITH_POSITIONS_OFFSETS))
  
              doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
                            Field.TermVector.WITH_POSITIONS_OFFSETS))
  
@@ -623,7 +633,7 @@ def log_exception_wrapper(f):
          try:
              f(*a)
          except Exception, e:
          try:
              f(*a)
          except Exception, e:
-            print("Error in indexing thread: %s" % e)
+            log.error("Error in indexing thread: %s" % e)
              traceback.print_exc()
              raise e
      return _wrap
              traceback.print_exc()
              raise e
      return _wrap
@@ -643,7 +653,6 @@ class ReusableIndex(Index):
          if ReusableIndex.index:
              self.index = ReusableIndex.index
          else:
          if ReusableIndex.index:
              self.index = ReusableIndex.index
          else:
-            print("opening index")
              Index.open(self, analyzer, **kw)
              ReusableIndex.index = self.index
              atexit.register(ReusableIndex.close_reusable)
              Index.open(self, analyzer, **kw)
              ReusableIndex.index = self.index
              atexit.register(ReusableIndex.close_reusable)
@@ -655,7 +664,6 @@ class ReusableIndex(Index):
      @staticmethod
      def close_reusable():
          if ReusableIndex.index:
      @staticmethod
      def close_reusable():
          if ReusableIndex.index:
-            print("closing index")
              ReusableIndex.index.optimize()
              ReusableIndex.index.close()
              ReusableIndex.index = None
              ReusableIndex.index.optimize()
              ReusableIndex.index.close()
              ReusableIndex.index = None
@@ -808,7 +816,7 @@ class SearchResult(object):
  
          # remove fragments with duplicated fid's and duplicated snippets
          frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
  
          # remove fragments with duplicated fid's and duplicated snippets
          frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
-        frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or hash(f),
+        frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
                                    lambda a, b: cmp(a[SCORE], b[SCORE]))
  
          # remove duplicate sections
                                    lambda a, b: cmp(a[SCORE], b[SCORE]))
  
          # remove duplicate sections
@@ -874,7 +882,6 @@ class SearchResult(object):
              for r in rl:
                  if r.book_id in books:
                      books[r.book_id].merge(r)
              for r in rl:
                  if r.book_id in books:
                      books[r.book_id].merge(r)
-                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                  else:
                      books[r.book_id] = r
          return books.values()
                  else:
                      books[r.book_id] = r
          return books.values()
@@ -1010,9 +1017,8 @@ class Search(IndexStore):
      def reopen(self, **unused):
          reader = self.searcher.getIndexReader()
          rdr = reader.reopen()
      def reopen(self, **unused):
          reader = self.searcher.getIndexReader()
          rdr = reader.reopen()
-        print "got signal to reopen index"
          if not rdr.equals(reader):
          if not rdr.equals(reader):
-            print "will reopen index"
+            log.debug('Reopening index')
              oldsearch = self.searcher
              self.searcher = IndexSearcher(rdr)
              oldsearch.close()
              oldsearch = self.searcher
              self.searcher = IndexSearcher(rdr)
              oldsearch.close()
@@ -1060,7 +1066,8 @@ class Search(IndexStore):
  
          return toks
  
  
          return toks
  
-    def fuzziness(self, fuzzy):
+    @staticmethod
+    def fuzziness(fuzzy):
          """Helper method to sanitize fuzziness"""
          if not fuzzy:
              return None
          """Helper method to sanitize fuzziness"""
          if not fuzzy:
              return None
@@ -1081,7 +1088,6 @@ class Search(IndexStore):
                  fuzzterms = []
  
                  while True:
                  fuzzterms = []
  
                  while True:
-                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                      ft = fuzzterm.term()
                      if ft:
                          fuzzterms.append(ft)
                      ft = fuzzterm.term()
                      if ft:
                          fuzzterms.append(ft)
@@ -1098,7 +1104,8 @@ class Search(IndexStore):
                  phrase.add(term)
          return phrase
  
                  phrase.add(term)
          return phrase
  
-    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
+    @staticmethod
+    def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
          """
          Returns term queries joined by boolean query.
          modal - applies to boolean query
          """
          Returns term queries joined by boolean query.
          modal - applies to boolean query
@@ -1252,7 +1259,6 @@ class Search(IndexStore):
          topDocs = self.searcher.search(q, only_in, max_results)
          for found in topDocs.scoreDocs:
              books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
          topDocs = self.searcher.search(q, only_in, max_results)
          for found in topDocs.scoreDocs:
              books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
-            print "* %s theme x content: %s" % (searched, books[-1]._hits)
  
          # query themes/content x author/title/tags
          q = BooleanQuery()
  
          # query themes/content x author/title/tags
          q = BooleanQuery()
@@ -1271,7 +1277,6 @@ class Search(IndexStore):
          topDocs = self.searcher.search(q, only_in, max_results)
          for found in topDocs.scoreDocs:
              books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
          topDocs = self.searcher.search(q, only_in, max_results)
          for found in topDocs.scoreDocs:
              books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
-            print "* %s scatter search: %s" % (searched, books[-1]._hits)
  
          return books
  
  
          return books
  
@@ -1332,9 +1337,17 @@ class Search(IndexStore):
              return None
          revision = stored.get('snippets_revision')
          if revision: revision = int(revision)
              return None
          revision = stored.get('snippets_revision')
          if revision: revision = int(revision)
+
          # locate content.
          book_id = int(stored.get('book_id'))
          # locate content.
          book_id = int(stored.get('book_id'))
-        snippets = Snippets(book_id, revision=revision).open()
+        snippets = Snippets(book_id, revision=revision)
+
+        try:
+            snippets.open()
+        except IOError, e:
+            log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
+            return []
+
          try:
              try:
                  text = snippets.get((int(position),
          try:
              try:
                  text = snippets.get((int(position),
@@ -1371,13 +1384,13 @@ class Search(IndexStore):
          if terms:
              return JArray('object')(terms, Term)
  
          if terms:
              return JArray('object')(terms, Term)
  
-    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
+    def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
          """
          Search for Tag objects using query.
          """
          if not pdcounter:
          """
          Search for Tag objects using query.
          """
          if not pdcounter:
-            filters = self.chain_filters([filter, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
-        tops = self.searcher.search(query, filters, max_results)
+            filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
+        tops = self.searcher.search(query, filt, max_results)
  
          tags = []
          for found in tops.scoreDocs:
  
          tags = []
          for found in tops.scoreDocs:
@@ -1396,22 +1409,23 @@ class Search(IndexStore):
                  else:
                      tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                      # don't add the pdcounter tag if same tag already exists
                  else:
                      tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                      # don't add the pdcounter tag if same tag already exists
-                if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
-                    tags.append(tag)
+
+                tags.append(tag)
+
              except catalogue.models.Tag.DoesNotExist: pass
              except PDCounterAuthor.DoesNotExist: pass
              except PDCounterBook.DoesNotExist: pass
  
              except catalogue.models.Tag.DoesNotExist: pass
              except PDCounterAuthor.DoesNotExist: pass
              except PDCounterBook.DoesNotExist: pass
  
-                #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
-        print 'returning %s' % tags
+        log.debug('search_tags: %s' % tags)
+
          return tags
  
          return tags
  
-    def search_books(self, query, filter=None, max_results=10):
+    def search_books(self, query, filt=None, max_results=10):
          """
          Searches for Book objects using query
          """
          bks = []
          """
          Searches for Book objects using query
          """
          bks = []
-        tops = self.searcher.search(query, filter, max_results)
+        tops = self.searcher.search(query, filt, max_results)
          for found in tops.scoreDocs:
              doc = self.searcher.doc(found.doc)
              try:
          for found in tops.scoreDocs:
              doc = self.searcher.doc(found.doc)
              try: