<field name="is_book" type="boolean" stored="false" indexed="true"/>
<field name="authors" type="text_general" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true"/>
<field name="translators" type="text_general" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true" />
- <field name="title" type="text_pl" stored="false" indexed="true"/>
+ <field name="title" type="text_pl_nonstop" stored="false" indexed="true"/>
<field name="title_orig" type="lowercase" stored="false" indexed="true"/>
<!-- published_date is declared as a plain stored string; the date-typed (tdate) declaration below is left disabled. -->
<!-- <field name="published_date" type="tdate" stored="false" indexed="true"/>-->
<field name="published_date" type="string" stored="true" indexed="true"/>
<!-- NOTE(review): kinds and genres are neither stored nor indexed, so as declared here Solr keeps no searchable or retrievable data for them. Confirm this is intentional. -->
<field name="kinds" type="lowercase" stored="false" indexed="false" multiValued="true" />
<field name="genres" type="lowercase" stored="false" indexed="false" multiValued="true" />
- <field name="metadata" type="text_pl" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true" />
+ <field name="metadata" type="text_pl_nonstop" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true" />
<field name="themes" type="lowercase" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" />
- <field name="themes_pl" type="text_pl" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" />
+ <field name="themes_pl" type="text_pl_nonstop" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" />
<field name="header_index" type="int" stored="true" indexed="true"/>
<field name="header_span" type="int" stored="true" indexed="true"/>
<field name="header_type" type="lowercase" stored="true" indexed="false"/>
<field name="tag_id" type="int" stored="true" indexed="true"/>
<field name="tag_name" type="lowercase" stored="true" indexed="true" />
- <field name="tag_name_pl" type="text_pl" stored="false" indexed="true" multiValued="true"/>
+ <field name="tag_name_pl" type="text_pl_nonstop" stored="false" indexed="true" multiValued="true"/>
<field name="tag_category" type="string" stored="true" indexed="true" />
<field name="is_pdcounter" type="boolean" stored="true" indexed="true" />
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pl.txt" format="snowball" enablePositionIncrements="true"/>
- <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
+ <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
+ <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
+ <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
+ <!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> -->
+ </analyzer>
+ </fieldType>
+
+ <!-- text_pl_nonstop: analyzer chain is StandardTokenizer, LowerCaseFilter, then MorfologikFilter; there is no StopFilterFactory step, so stop words stay in the index for fields of this type. -->
+ <fieldType name="text_pl_nonstop" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
<!-- NOTE(review): the three commented-out alternatives below name Portuguese stemmers; they look carried over from a Portuguese field type and do not apply to this analyzer. -->
<!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
<!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> -->
</analyzer>
</fieldType>
-
<!-- Portuguese -->
<fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100">
self._hits.append(hit)
+ # Alternate constructor: builds a result from a book object, assembling the
+ # doc dict here from the book's own attributes.
+ #   book        -- object exposing .id and .popularity.count
+ #   how_found   -- passed through unchanged to the class constructor
+ #   query_terms -- passed through unchanged to the class constructor
+ # Returns the new instance with the book attached as ._book.
+ @classmethod
+ def from_book(cls, book, how_found=None, query_terms=None):
+ doc = {
+ # The score is the book's popularity count.
+ 'score': book.popularity.count,
+ 'book_id': book.id,
+ # Fixed placeholder value; no date is read from the book here.
+ 'published_date': 0,
+ }
+ result = cls(doc, how_found=how_found, query_terms=query_terms)
+ # Attach the book itself (presumably so a later lookup can reuse it -- confirm against get_book).
+ result._book = book
+ return result
+
def __unicode__(self):
return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
(self.book_id, len(self._hits),
if self.book_id != other.book_id:
raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
self._hits += other._hits
- self._score += max(other._score, 0) + 0.5
+ self._score += max(other._score, 0)
return self
def get_book(self):
return q
+ def search_by_author(self, words):
+ from catalogue.models import Book
+ books = Book.objects.filter(parent=None).order_by('-popularity__count')
+ for word in words:
+ books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
+ return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
+
def search_words(self, words, fields, book=True):
+ if book and fields == ['authors']:
+ return self.search_by_author(words)
filters = []
for word in words:
- if word not in stopwords:
+ if book or (word not in stopwords):
word_filter = None
for field in fields:
q = self.index.Q(**{field: word})
(['metadata'], True),
(['text', 'themes_pl'], False),
)
- for fieldset, is_book in fieldsets:
- search_fields += fieldset
+ for fields, is_book in fieldsets:
+ search_fields += fields
results_parts.append(search.search_words(words, search_fields, book=is_book))
results = []
results.append(result)
ids_results[book_id] = result
+ descendant_ids = set(
+ Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
+ results = [result for result in results if result.book_id not in descendant_ids]
+
for result in results:
search.get_snippets(result, query, num=3)