X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/cad95a5f21346628d1dffa3b50ffa8f38baa5972..d048787bd42b9d009aa7b3dfb5c08f7682f231e6:/src/search/index.py diff --git a/src/search/index.py b/src/search/index.py index a1c2716a8..bd31a2acf 100644 --- a/src/search/index.py +++ b/src/search/index.py @@ -1,29 +1,29 @@ -# -*- coding: utf-8 -*- # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from django.conf import settings - +from functools import reduce, total_ordering +from itertools import chain +import logging +import operator import os import re +from django.conf import settings from librarian import dcparser from librarian.parser import WLDocument from lxml import etree +import scorched import catalogue.models import picture.models from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook -from itertools import chain -import sunburnt -import custom -import operator -import logging from wolnelektury.utils import makedirs +from . import custom log = logging.getLogger('search') + if os.path.isfile(settings.SOLR_STOPWORDS): stopwords = set( - line.decode('utf-8').strip() + line.strip() for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#')) else: stopwords = set() @@ -129,7 +129,7 @@ class Index(SolrIndex): """ uids = set() for q in queries: - if isinstance(q, sunburnt.search.LuceneQuery): + if isinstance(q, scorched.search.LuceneQuery): q = self.index.query(q) q.field_limiter.update(['uid']) st = 0 @@ -142,7 +142,8 @@ class Index(SolrIndex): uids.add(res['uid']) st += rows if uids: - self.index.delete(uids) + # FIXME: With Solr API change, this doesn't work. + #self.index.delete(uids) return True else: return False @@ -324,9 +325,9 @@ class Index(SolrIndex): elif type_indicator == dcparser.as_person: p = getattr(book_info, field.name) if isinstance(p, dcparser.Person): - persons = unicode(p) + persons = str(p) else: - persons = ', '.join(map(unicode, p)) + persons = ', '.join(map(str, p)) fields[field.name] = persons elif type_indicator == dcparser.as_date: dt = getattr(book_info, field.name) @@ -478,7 +479,7 @@ class Index(SolrIndex): fid = start.attrib['id'][1:] handle_text.append(lambda text: None) if start.text is not None: - fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(',')))) + fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(',')))) elif end is not None and end.tag == 'motyw': handle_text.pop() @@ -557,6 +558,7 @@ class Index(SolrIndex): self.index.add(doc) +@total_ordering class SearchResult(object): def __init__(self, doc, how_found=None, query_terms=None): self.boost = 1.0 @@ -610,14 +612,14 @@ class SearchResult(object): result._book = book return result - def __unicode__(self): + def __str__(self): return u"" % \ (self.book_id, len(self._hits), len(self._processed_hits) if self._processed_hits else -1, self._score, len(self.snippets)) - def __str__(self): - return unicode(self).encode('utf-8') + def __bytes__(self): + return str(self).encode('utf-8') @property def score(self): @@ -633,7 +635,10 @@ class SearchResult(object): def get_book(self): if self._book is not None: return self._book - self._book = catalogue.models.Book.objects.get(id=self.book_id) + try: + self._book = catalogue.models.Book.objects.get(id=self.book_id) + except catalogue.models.Book.DoesNotExist: + self._book = None return self._book book = property(get_book) @@ -653,12 +658,12 @@ class SearchResult(object): # to sections and fragments frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits) - sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits) + sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None] # sections not covered by fragments - sect = filter(lambda s: 0 == len(filter( + sect = filter(lambda s: 0 == len(list(filter( lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] < - f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect) + f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect) def remove_duplicates(lst, keyfn, compare): els = {} @@ -691,7 +696,7 @@ class SearchResult(object): m.update(s[self.OTHER]) sections[si] = m - hits = sections.values() + hits = list(sections.values()) for f in frags: try: @@ -705,19 +710,19 @@ class SearchResult(object): if self.query_terms is not None: for i in range(0, len(f[self.OTHER]['themes'])): tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ') - tms = map(unicode.lower, tms) + tms = map(str.lower, tms) for qt in self.query_terms: if qt in tms: themes_hit.add(f[self.OTHER]['themes'][i]) break def theme_by_name(n): - th = filter(lambda t: t.name == n, themes) + th = list(filter(lambda t: t.name == n, themes)) if th: return th[0] else: return None - themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit)) + themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit))) m = {'score': f[self.SCORE], 'fragment': frag, @@ -745,13 +750,17 @@ class SearchResult(object): books[r.book_id] = r return books.values() - def __cmp__(self, other): - c = cmp(self.score, other.score) - if c == 0: - # this is inverted, because earlier date is better - return cmp(other.published_date, self.published_date) - else: - return c + def get_sort_key(self): + return (-self.score, + self.published_date, + self.book.sort_key_author if self.book else '', + self.book.sort_key if self.book else '') + + def __lt__(self, other): + return self.get_sort_key() > other.get_sort_key() + + def __eq__(self, other): + return self.get_sort_key() == other.get_sort_key() def __len__(self): return len(self.hits) @@ -766,6 +775,7 @@ class SearchResult(object): return None +@total_ordering class PictureResult(object): def __init__(self, doc, how_found=None, query_terms=None): self.boost = 1.0 @@ -791,11 +801,11 @@ class PictureResult(object): self._hits.append(hit) - def __unicode__(self): + def __str__(self): return u"" % (self.picture_id, self._score) def __repr__(self): - return unicode(self) + return str(self) @property def score(self): @@ -829,7 +839,7 @@ class PictureResult(object): if self.query_terms is not None: for i in range(0, len(hit[self.OTHER]['themes'])): tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ') - tms = map(unicode.lower, tms) + tms = map(str.lower, tms) for qt in self.query_terms: if qt in tms: themes_hit.add(hit[self.OTHER]['themes'][i]) @@ -866,8 +876,11 @@ class PictureResult(object): books[r.picture_id] = r return books.values() - def __cmp__(self, other): - return cmp(self.score, other.score) + def __lt__(self, other): + return self.score < other.score + + def __eq__(self, other): + return self.score == other.score class Search(SolrIndex): @@ -957,13 +970,15 @@ class Search(SolrIndex): text = snippets.get((int(position), int(length))) snip = self.index.highlight(text=text, field=field, q=query) + if not snip and field == 'text': + snip = self.index.highlight(text=text, field='text_nonstem', q=query) if snip not in snips: snips[idx] = snip if snip: num -= 1 idx += 1 - except IOError, e: + except IOError as e: book = catalogue.models.Book.objects.filter(id=book_id) if not book: log.error("Book does not exist for book id = %d" % book_id) @@ -973,8 +988,8 @@ class Search(SolrIndex): finally: snippets.close() - # remove verse end markers.. - snips = map(lambda s: s and s.replace("/\n", "\n"), snips) + # remove verse end markers.. + snips = [s.replace("/\n", "\n") if s else s for s in snips] searchresult.snippets = snips