limited stopwords + better search by author + remove descendant results
author Jan Szejko <janek37@gmail.com>
Tue, 20 Feb 2018 14:56:09 +0000 (15:56 +0100)
committer Jan Szejko <janek37@gmail.com>
Tue, 20 Feb 2018 14:56:09 +0000 (15:56 +0100)
doc/schema.xml
src/search/index.py
src/search/views.py

diff --git a/doc/schema.xml b/doc/schema.xml
index 56172d3..ccf9bd5 100644 (file)
    <field name="is_book" type="boolean" stored="false" indexed="true"/>
    <field name="authors" type="text_general" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true"/>
    <field name="translators" type="text_general" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true" />
    <field name="is_book" type="boolean" stored="false" indexed="true"/>
    <field name="authors" type="text_general" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true"/>
    <field name="translators" type="text_general" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true" />
-   <field name="title" type="text_pl" stored="false" indexed="true"/>
+   <field name="title" type="text_pl_nonstop" stored="false" indexed="true"/>
    <field name="title_orig" type="lowercase" stored="false" indexed="true"/>
 <!--   <field name="published_date" type="tdate" stored="false" indexed="true"/>-->
    <field name="published_date" type="string" stored="true" indexed="true"/>
    <field name="title_orig" type="lowercase" stored="false" indexed="true"/>
 <!--   <field name="published_date" type="tdate" stored="false" indexed="true"/>-->
    <field name="published_date" type="string" stored="true" indexed="true"/>
    <field name="kinds" type="lowercase" stored="false" indexed="false" multiValued="true" />
    <field name="genres" type="lowercase" stored="false" indexed="false" multiValued="true" />
 
    <field name="kinds" type="lowercase" stored="false" indexed="false" multiValued="true" />
    <field name="genres" type="lowercase" stored="false" indexed="false" multiValued="true" />
 
-   <field name="metadata" type="text_pl" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true" />
+   <field name="metadata" type="text_pl_nonstop" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true" />
 
    <field name="themes" type="lowercase" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" />
 
    <field name="themes" type="lowercase" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" />
-   <field name="themes_pl" type="text_pl" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" />
+   <field name="themes_pl" type="text_pl_nonstop" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" />
    <field name="header_index" type="int" stored="true" indexed="true"/>
    <field name="header_span" type="int" stored="true" indexed="true"/>
    <field name="header_type" type="lowercase" stored="true" indexed="false"/>
    <field name="header_index" type="int" stored="true" indexed="true"/>
    <field name="header_span" type="int" stored="true" indexed="true"/>
    <field name="header_type" type="lowercase" stored="true" indexed="false"/>
 
    <field name="tag_id" type="int" stored="true" indexed="true"/>
    <field name="tag_name" type="lowercase" stored="true" indexed="true" />
 
    <field name="tag_id" type="int" stored="true" indexed="true"/>
    <field name="tag_name" type="lowercase" stored="true" indexed="true" />
-   <field name="tag_name_pl" type="text_pl" stored="false" indexed="true" multiValued="true"/>
+   <field name="tag_name_pl" type="text_pl_nonstop" stored="false" indexed="true" multiValued="true"/>
    <field name="tag_category" type="string" stored="true" indexed="true" />
    <field name="is_pdcounter" type="boolean" stored="true" indexed="true" />
 
    <field name="tag_category" type="string" stored="true" indexed="true" />
    <field name="is_pdcounter" type="boolean" stored="true" indexed="true" />
 
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pl.txt" format="snowball" enablePositionIncrements="true"/>
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pl.txt" format="snowball" enablePositionIncrements="true"/>
-       <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
+        <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
+        <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
+        <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
+        <!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> -->
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="text_pl_nonstop" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
         <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
         <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
         <!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> -->
       </analyzer>
     </fieldType>
         <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
         <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
         <!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> -->
       </analyzer>
     </fieldType>
-    
     
     <!-- Portuguese -->
     <fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100">
     
     <!-- Portuguese -->
     <fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100">
diff --git a/src/search/index.py b/src/search/index.py
index ab3286a..1026014 100644 (file)
@@ -558,6 +558,17 @@ class SearchResult(object):
 
             self._hits.append(hit)
 
 
             self._hits.append(hit)
 
+    @classmethod
+    def from_book(cls, book, how_found=None, query_terms=None):
+        doc = {
+            'score': book.popularity.count,
+            'book_id': book.id,
+            'published_date': 0,
+        }
+        result = cls(doc, how_found=how_found, query_terms=query_terms)
+        result._book = book
+        return result
+
     def __unicode__(self):
         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
             (self.book_id, len(self._hits),
     def __unicode__(self):
         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
             (self.book_id, len(self._hits),
@@ -575,7 +586,7 @@ class SearchResult(object):
         if self.book_id != other.book_id:
             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
         self._hits += other._hits
         if self.book_id != other.book_id:
             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
         self._hits += other._hits
-        self._score += max(other._score, 0) + 0.5
+        self._score += max(other._score, 0)
         return self
 
     def get_book(self):
         return self
 
     def get_book(self):
@@ -734,10 +745,19 @@ class Search(SolrIndex):
 
         return q
 
 
         return q
 
+    def search_by_author(self, words):
+        from catalogue.models import Book
+        books = Book.objects.filter(parent=None).order_by('-popularity__count')
+        for word in words:
+            books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
+        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
+
     def search_words(self, words, fields, book=True):
     def search_words(self, words, fields, book=True):
+        if book and fields == ['authors']:
+            return self.search_by_author(words)
         filters = []
         for word in words:
         filters = []
         for word in words:
-            if word not in stopwords:
+            if book or (word not in stopwords):
                 word_filter = None
                 for field in fields:
                     q = self.index.Q(**{field: word})
                 word_filter = None
                 for field in fields:
                     q = self.index.Q(**{field: word})
diff --git a/src/search/views.py b/src/search/views.py
index bda24b0..f7f6040 100644 (file)
@@ -142,8 +142,8 @@ def main(request):
         (['metadata'], True),
         (['text', 'themes_pl'], False),
     )
         (['metadata'], True),
         (['text', 'themes_pl'], False),
     )
-    for fieldset, is_book in fieldsets:
-        search_fields += fieldset
+    for fields, is_book in fieldsets:
+        search_fields += fields
         results_parts.append(search.search_words(words, search_fields, book=is_book))
 
     results = []
         results_parts.append(search.search_words(words, search_fields, book=is_book))
 
     results = []
@@ -157,6 +157,10 @@ def main(request):
                 results.append(result)
                 ids_results[book_id] = result
 
                 results.append(result)
                 ids_results[book_id] = result
 
+    descendant_ids = set(
+        Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
+    results = [result for result in results if result.book_id not in descendant_ids]
+
     for result in results:
         search.get_snippets(result, query, num=3)
 
     for result in results:
         search.get_snippets(result, query, num=3)