ignore stopwords in query
author Jan Szejko <janek37@gmail.com>
Mon, 19 Feb 2018 11:39:11 +0000 (12:39 +0100)
committer Jan Szejko <janek37@gmail.com>
Mon, 19 Feb 2018 11:39:11 +0000 (12:39 +0100)
src/search/index.py
src/wolnelektury/settings/basic.py

index 4c278ea..5cae3e3 100644 (file)
@@ -20,6 +20,13 @@ from wolnelektury.utils import makedirs
 
 log = logging.getLogger('search')
 
+if os.path.isfile(settings.SOLR_STOPWORDS):
+    stopwords = set(
+        line.decode('utf-8').strip()
+        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
+else:
+    stopwords = set()
+
 
 class SolrIndex(object):
     def __init__(self, mode=None):
@@ -731,14 +738,17 @@ class Search(SolrIndex):
     def search_words(self, words, fields, book=True):
         filters = []
         for word in words:
-            word_filter = None
-            for field in fields:
-                q = self.index.Q(**{field: word})
-                if word_filter is None:
-                    word_filter = q
-                else:
-                    word_filter |= q
-            filters.append(word_filter)
+            if word not in stopwords:
+                word_filter = None
+                for field in fields:
+                    q = self.index.Q(**{field: word})
+                    if word_filter is None:
+                        word_filter = q
+                    else:
+                        word_filter |= q
+                filters.append(word_filter)
+        if not filters:
+            return []
         if book:
             query = self.index.query(is_book=True)
         else:
index c9939b5..32b48f2 100644 (file)
@@ -26,6 +26,7 @@ DATABASES = {
 
 SOLR = "http://localhost:8983/solr/wl/"
 SOLR_TEST = "http://localhost:8983/solr/wl_test/"
+SOLR_STOPWORDS = "/path/to/solr/data/conf/lang/stopwords_pl.txt"
 
 # Local time zone for this installation. Choices can be found here:
 # http://en.wikipedia.org/wiki/List_of_tz_zones_by_name