log = logging.getLogger('search')
+# Stopwords are read from settings.SOLR_STOPWORDS: every line that does not
+# start with '#' contributes one entry (decoded as UTF-8 and stripped).
+# When the file does not exist the set is empty.
+if os.path.isfile(settings.SOLR_STOPWORDS):
+    # Open inside `with` so the handle is closed once the set is built.
+    with open(settings.SOLR_STOPWORDS) as stopwords_file:
+        stopwords = set(
+            line.decode('utf-8').strip()
+            for line in stopwords_file if not line.startswith('#'))
+else:
+    stopwords = set()
+
class SolrIndex(object):
def __init__(self, mode=None):
'dramat_wierszowany_lp',
'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
'wywiad',
- ]
+ ]
ignore_content_tags = [
- 'uwaga', 'extra', 'nota_red',
+ 'uwaga', 'extra', 'nota_red', 'abstrakt',
'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
'didaskalia',
'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
- ]
+ ]
footnote_tags = ['pa', 'pt', 'pr', 'pe']
book_info = dcparser.parse(open(book.xml_file.path))
fields['slug'] = book.slug
- fields['tags'] = [t.name for t in book.tags]
fields['is_book'] = True
# validator, name
self._hits.append(hit)
+    @classmethod
+    def from_book(cls, book, how_found=None, query_terms=None):
+        """
+        Build a search result directly from a book object.
+
+        The document handed to the constructor carries the book's id, its
+        popularity count as the score and a `published_date` of 0.  The
+        book itself is stored on the result as `_book`.
+        """
+        synthetic_doc = dict(
+            score=book.popularity.count,
+            book_id=book.id,
+            published_date=0,
+        )
+        search_result = cls(synthetic_doc, how_found=how_found, query_terms=query_terms)
+        search_result._book = book
+        return search_result
+
def __unicode__(self):
return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
(self.book_id, len(self._hits),
def merge(self, other):
if self.book_id != other.book_id:
- raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
+ raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))  # results for different books cannot be merged
self._hits += other._hits
- if other.score > self.score:
- self._score = other._score
+ self._score += max(other._score, 0)  # add other's score to ours; a negative score from other adds 0
return self
def get_book(self):
return q
+    def search_by_author(self, words):
+        """
+        Find books without a parent whose cached author matches every word.
+
+        Each word has to occur as a whole word, case-insensitively, in
+        `cached_author`.  At most 30 books are returned, ordered by
+        descending popularity count, each wrapped with
+        SearchResult.from_book.
+        """
+        import re
+        from catalogue.models import Book
+        # select_related() takes relation names: `popularity` is the relation,
+        # `count` is a plain field on it.  Applied once, outside the loop, so
+        # it is also in effect when `words` is empty.
+        books = (Book.objects.filter(parent=None)
+                 .select_related('popularity')
+                 .order_by('-popularity__count'))
+        for word in words:
+            # Escape regex metacharacters so the word is matched literally
+            # instead of being interpreted (or rejected) as a pattern.
+            literal = re.sub(r'([\\.^$*+?()\[\]{}|])', r'\\\1', word)
+            # \m and \M are word-boundary anchors in PostgreSQL regex syntax
+            # (presumably the database in use -- confirm).
+            books = books.filter(cached_author__iregex=r'\m%s\M' % literal)
+        return [
+            SearchResult.from_book(book, how_found='search_by_author', query_terms=words)
+            for book in books[:30]]
+
def search_words(self, words, fields, book=True):
+ if book and fields == ['authors']:
+ return self.search_by_author(words)
filters = []
for word in words:
- word_filter = None
- for field in fields:
- q = self.index.Q(**{field: word})
- if word_filter is None:
- word_filter = q
- else:
- word_filter |= q
- filters.append(word_filter)
+ if book or (word not in stopwords):
+ word_filter = None
+ for field in fields:
+ q = self.index.Q(**{field: word})
+ if word_filter is None:
+ word_filter = q
+ else:
+ word_filter |= q
+ filters.append(word_filter)
+ if not filters:
+ return []
if book:
query = self.index.query(is_book=True)
else:
return snips
- def hint_tags(self, query, pdcounter=True, prefix=True):
- """
- Return auto-complete hints for tags
- using prefix search.
- """
- q = self.index.Q()
- query = query.strip()
- for field in ['tag_name', 'tag_name_pl']:
- if prefix:
- q |= self.index.Q(**{field: query + "*"})
- else:
- q |= self.make_term_query(query, field=field)
- qu = self.index.query(q)
-
- return self.search_tags(qu, pdcounter=pdcounter)
-
- def search_tags(self, query, filters=None, pdcounter=False):
- """
- Search for Tag objects using query.
- """
- if not filters:
- filters = []
- if not pdcounter:
- filters.append(~self.index.Q(is_pdcounter=True))
- res = self.apply_filters(query, filters).execute()
-
- tags = []
- pd_tags = []
-
- for doc in res:
- is_pdcounter = doc.get('is_pdcounter', False)
- category = doc.get('tag_category')
- try:
- if is_pdcounter:
- if category == 'pd_author':
- tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
- else: # category == 'pd_book':
- tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
- tag.category = 'pd_book' # make it look more lik a tag.
- pd_tags.append(tag)
- else:
- tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
- tags.append(tag)
-
- except catalogue.models.Tag.DoesNotExist:
- pass
- except PDCounterAuthor.DoesNotExist:
- pass
- except PDCounterBook.DoesNotExist:
- pass
-
- tags_slugs = set(map(lambda t: t.slug, tags))
- tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
-
- log.debug('search_tags: %s' % tags)
-
- return tags
-
- def hint_books(self, query, prefix=True):
- """
- Returns auto-complete hints for book titles
- Because we do not index 'pseudo' title-tags.
- Prefix search.
- """
- q = self.index.Q()
- query = query.strip()
- if prefix:
- q |= self.index.Q(title=query + "*")
- q |= self.index.Q(title_orig=query + "*")
- else:
- q |= self.make_term_query(query, field='title')
- q |= self.make_term_query(query, field='title_orig')
- qu = self.index.query(q)
- only_books = self.index.Q(is_book=True)
- return self.search_books(qu, [only_books])
-
- def search_books(self, query, filters=None, max_results=10):
- """
- Searches for Book objects using query
- """
- bks = []
- bks_found = set()
- query = query.query(is_book=True)
- res = self.apply_filters(query, filters).field_limit(['book_id'])
- for r in res:
- try:
- bid = r['book_id']
- if bid not in bks_found:
- bks.append(catalogue.models.Book.objects.get(id=bid))
- bks_found.add(bid)
- except catalogue.models.Book.DoesNotExist:
- pass
- return bks
-
@staticmethod
def apply_filters(query, filters):
"""