limited stopwords + better search by author + remove descendant results

author Jan Szejko <janek37@gmail.com>

Tue, 20 Feb 2018 14:56:09 +0000 (15:56 +0100)

committer Jan Szejko <janek37@gmail.com>

Tue, 20 Feb 2018 14:56:09 +0000 (15:56 +0100)
author Jan Szejko <janek37@gmail.com>
Tue, 20 Feb 2018 14:56:09 +0000 (15:56 +0100)
committer Jan Szejko <janek37@gmail.com>
Tue, 20 Feb 2018 14:56:09 +0000 (15:56 +0100)
diff --git a/doc/schema.xml b/doc/schema.xml

index 56172d3..ccf9bd5 100644 (file)
--- a/doc/schema.xml
+++ b/doc/schema.xml
@@ -134,7 +134,7 @@
     <field name="is_book" type="boolean" stored="false" indexed="true"/>
     <field name="authors" type="text_general" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true"/>
     <field name="translators" type="text_general" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true" />
-   <field name="title" type="text_pl" stored="false" indexed="true"/>
+   <field name="title" type="text_pl_nonstop" stored="false" indexed="true"/>
     <field name="title_orig" type="lowercase" stored="false" indexed="true"/>
  <!--   <field name="published_date" type="tdate" stored="false" indexed="true"/>-->
     <field name="published_date" type="string" stored="true" indexed="true"/>
@@ -143,10 +143,10 @@
     <field name="kinds" type="lowercase" stored="false" indexed="false" multiValued="true" />
     <field name="genres" type="lowercase" stored="false" indexed="false" multiValued="true" />
  
-   <field name="metadata" type="text_pl" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true" />
+   <field name="metadata" type="text_pl_nonstop" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true" />
  
     <field name="themes" type="lowercase" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" />
-   <field name="themes_pl" type="text_pl" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" />
+   <field name="themes_pl" type="text_pl_nonstop" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" />
     <field name="header_index" type="int" stored="true" indexed="true"/>
     <field name="header_span" type="int" stored="true" indexed="true"/>
     <field name="header_type" type="lowercase" stored="true" indexed="false"/>
@@ -159,7 +159,7 @@
  
     <field name="tag_id" type="int" stored="true" indexed="true"/>
     <field name="tag_name" type="lowercase" stored="true" indexed="true" />
-   <field name="tag_name_pl" type="text_pl" stored="false" indexed="true" multiValued="true"/>
+   <field name="tag_name_pl" type="text_pl_nonstop" stored="false" indexed="true" multiValued="true"/>
     <field name="tag_category" type="string" stored="true" indexed="true" />
     <field name="is_pdcounter" type="boolean" stored="true" indexed="true" />
  
@@ -1073,13 +1073,23 @@
          <tokenizer class="solr.StandardTokenizerFactory"/>
          <filter class="solr.LowerCaseFilterFactory"/>
          <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pl.txt" format="snowball" enablePositionIncrements="true"/>
-       <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
+        <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
+        <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
+        <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
+        <!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> -->
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="text_pl_nonstop" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
          <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
          <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
          <!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> -->
        </analyzer>
      </fieldType>
-    
      
      <!-- Portuguese -->
      <fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100">
diff --git a/src/search/index.py b/src/search/index.py

index ab3286a..1026014 100644 (file)
--- a/src/search/index.py
+++ b/src/search/index.py
@@ -558,6 +558,17 @@ class SearchResult(object):
  
              self._hits.append(hit)
  
+    @classmethod
+    def from_book(cls, book, how_found=None, query_terms=None):
+        doc = {
+            'score': book.popularity.count,
+            'book_id': book.id,
+            'published_date': 0,
+        }
+        result = cls(doc, how_found=how_found, query_terms=query_terms)
+        result._book = book
+        return result
+
      def __unicode__(self):
          return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
              (self.book_id, len(self._hits),
@@ -575,7 +586,7 @@ class SearchResult(object):
          if self.book_id != other.book_id:
              raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
          self._hits += other._hits
-        self._score += max(other._score, 0) + 0.5
+        self._score += max(other._score, 0)
          return self
  
      def get_book(self):
@@ -734,10 +745,19 @@ class Search(SolrIndex):
  
          return q
  
+    def search_by_author(self, words):
+        from catalogue.models import Book
+        books = Book.objects.filter(parent=None).order_by('-popularity__count')
+        for word in words:
+            books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
+        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
+
      def search_words(self, words, fields, book=True):
+        if book and fields == ['authors']:
+            return self.search_by_author(words)
          filters = []
          for word in words:
-            if word not in stopwords:
+            if book or (word not in stopwords):
                  word_filter = None
                  for field in fields:
                      q = self.index.Q(**{field: word})
diff --git a/src/search/views.py b/src/search/views.py

index bda24b0..f7f6040 100644 (file)
--- a/src/search/views.py
+++ b/src/search/views.py
@@ -142,8 +142,8 @@ def main(request):
          (['metadata'], True),
          (['text', 'themes_pl'], False),
      )
-    for fieldset, is_book in fieldsets:
-        search_fields += fieldset
+    for fields, is_book in fieldsets:
+        search_fields += fields
          results_parts.append(search.search_words(words, search_fields, book=is_book))
  
      results = []
@@ -157,6 +157,10 @@ def main(request):
                  results.append(result)
                  ids_results[book_id] = result
  
+    descendant_ids = set(
+        Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
+    results = [result for result in results if result.book_id not in descendant_ids]
+
      for result in results:
          search.get_snippets(result, query, num=3)
author	Jan Szejko <janek37@gmail.com>
	Tue, 20 Feb 2018 14:56:09 +0000 (15:56 +0100)
committer	Jan Szejko <janek37@gmail.com>
	Tue, 20 Feb 2018 14:56:09 +0000 (15:56 +0100)
doc/schema.xml		patch | blob | history
src/search/index.py		patch | blob | history
src/search/views.py		patch | blob | history