large boost for multiple results
[wolnelektury.git] / src / search / index.py
index 4f9a765..ab3286a 100644 (file)
@@ -20,6 +20,13 @@ from wolnelektury.utils import makedirs
 
 log = logging.getLogger('search')
 
+stopwords = set()  # words search_words() skips; stays empty without a stopwords file
+if os.path.isfile(settings.SOLR_STOPWORDS):
+    with open(settings.SOLR_STOPWORDS, 'rb') as stopwords_file:  # bytes on py2 and py3
+        stopwords = set(
+            line.decode('utf-8').strip() for line in stopwords_file
+            if line.strip() and not line.startswith(b'#'))  # drop blanks and '#' comments
+
 
 class SolrIndex(object):
     def __init__(self, mode=None):
@@ -271,14 +278,14 @@ class Index(SolrIndex):
         'dramat_wierszowany_lp',
         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
         'wywiad',
-        ]
+    ]
 
     ignore_content_tags = [
-        'uwaga', 'extra', 'nota_red',
+        'uwaga', 'extra', 'nota_red', 'abstrakt',
         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
         'didaskalia',
         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
-        ]
+    ]
 
     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 
@@ -566,10 +573,9 @@ class SearchResult(object):
 
     def merge(self, other):
         if self.book_id != other.book_id:
-            raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
+            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
         self._hits += other._hits
-        if other.score > self.score:
-            self._score = other._score
+        self._score += max(other._score, 0) + 0.5
         return self
 
     def get_book(self):
@@ -670,7 +676,7 @@ class SearchResult(object):
             m.update(f[self.OTHER])
             hits.append(m)
 
-        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
+        hits.sort(key=lambda h: h['score'], reverse=True)
 
         self._processed_hits = hits
 
@@ -731,20 +737,23 @@ class Search(SolrIndex):
     def search_words(self, words, fields, book=True):
         filters = []
         for word in words:
-            word_filter = None
-            for field in fields:
-                q = self.index.Q(**{field: word})
-                if word_filter is None:
-                    word_filter = q
-                else:
-                    word_filter |= q
-            filters.append(word_filter)
+            if word not in stopwords:
+                word_filter = None
+                for field in fields:
+                    q = self.index.Q(**{field: word})
+                    if word_filter is None:
+                        word_filter = q
+                    else:
+                        word_filter |= q
+                filters.append(word_filter)
+        if not filters:
+            return []
         if book:
             query = self.index.query(is_book=True)
         else:
             query = self.index.query()
         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
-        return [SearchResult(found, how_found='search_words') for found in query]
+        return [SearchResult(found, how_found='search_words', query_terms=words) for found in query.execute()]
 
     def get_snippets(self, searchresult, query, field='text', num=1):
         """
@@ -790,104 +799,6 @@ class Search(SolrIndex):
 
         return snips
 
-    def hint_tags(self, query, pdcounter=True, prefix=True):
-        """
-        Return auto-complete hints for tags
-        using prefix search.
-        """
-        q = self.index.Q()
-        query = query.strip()
-        for field in ['tag_name', 'tag_name_pl']:
-            if prefix:
-                q |= self.index.Q(**{field: query + "*"})
-            else:
-                q |= self.make_term_query(query, field=field)
-        qu = self.index.query(q)
-
-        return self.search_tags(qu, pdcounter=pdcounter)
-
-    def search_tags(self, query, filters=None, pdcounter=False):
-        """
-        Search for Tag objects using query.
-        """
-        if not filters:
-            filters = []
-        if not pdcounter:
-            filters.append(~self.index.Q(is_pdcounter=True))
-        res = self.apply_filters(query, filters).execute()
-
-        tags = []
-        pd_tags = []
-
-        for doc in res:
-            is_pdcounter = doc.get('is_pdcounter', False)
-            category = doc.get('tag_category')
-            try:
-                if is_pdcounter:
-                    if category == 'pd_author':
-                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
-                    elif category == 'pd_book':
-                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
-                        tag.category = 'pd_book'  # make it look more lik a tag.
-                    else:
-                        # WTF
-                        print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (
-                            int(doc.get('tag_id')), category)).encode('utf-8')
-                    pd_tags.append(tag)
-                else:
-                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
-                    tags.append(tag)
-
-            except catalogue.models.Tag.DoesNotExist:
-                pass
-            except PDCounterAuthor.DoesNotExist:
-                pass
-            except PDCounterBook.DoesNotExist:
-                pass
-
-        tags_slugs = set(map(lambda t: t.slug, tags))
-        tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
-
-        log.debug('search_tags: %s' % tags)
-
-        return tags
-
-    def hint_books(self, query, prefix=True):
-        """
-        Returns auto-complete hints for book titles
-        Because we do not index 'pseudo' title-tags.
-        Prefix search.
-        """
-        q = self.index.Q()
-        query = query.strip()
-        if prefix:
-            q |= self.index.Q(title=query + "*")
-            q |= self.index.Q(title_orig=query + "*")
-        else:
-            q |= self.make_term_query(query, field='title')
-            q |= self.make_term_query(query, field='title_orig')
-        qu = self.index.query(q)
-        only_books = self.index.Q(is_book=True)
-        return self.search_books(qu, [only_books])
-
-    def search_books(self, query, filters=None, max_results=10):
-        """
-        Searches for Book objects using query
-        """
-        bks = []
-        bks_found = set()
-        query = query.query(is_book=True)
-        res = self.apply_filters(query, filters).field_limit(['book_id'])
-        for r in res:
-            try:
-                bid = r['book_id']
-                if bid not in bks_found:
-                    bks.append(catalogue.models.Book.objects.get(id=bid))
-                    bks_found.add(bid)
-            except catalogue.models.Book.DoesNotExist:
-                pass
-        return bks
-
     @staticmethod
     def apply_filters(query, filters):
         """