X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/154870f0416b6b387637d6035c96321410512e95..94648e7ec0f755a99251bb47389eedf934aa5d3e:/src/search/index.py diff --git a/src/search/index.py b/src/search/index.py index a712b0702..68a2b3b18 100644 --- a/src/search/index.py +++ b/src/search/index.py @@ -1,7 +1,7 @@ # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from functools import total_ordering +from functools import reduce, total_ordering from itertools import chain import logging import operator @@ -9,6 +9,9 @@ import os import re from django.conf import settings from librarian import dcparser +import librarian.meta.types.date +import librarian.meta.types.person +import librarian.meta.types.text from librarian.parser import WLDocument from lxml import etree import scorched @@ -95,7 +98,10 @@ class Snippets(object): of the snippet stored there. """ self.file.seek(pos[0], 0) - txt = self.file.read(pos[1]).decode('utf-8') + try: + txt = self.file.read(pos[1]).decode('utf-8') + except: + return '' return txt def close(self): @@ -142,7 +148,8 @@ class Index(SolrIndex): uids.add(res['uid']) st += rows if uids: - self.index.delete(uids) + # FIXME: With Solr API change, this doesn't work. + #self.index.delete(uids) return True else: return False @@ -314,21 +321,20 @@ class Index(SolrIndex): if hasattr(book_info, field.name): if not getattr(book_info, field.name): continue - # since no type information is available, we use validator - type_indicator = field.validator - if type_indicator == dcparser.as_unicode: + type_indicator = field.value_type + if issubclass(type_indicator, librarian.meta.types.text.TextValue): s = getattr(book_info, field.name) if field.multiple: s = ', '.join(s) fields[field.name] = s - elif type_indicator == dcparser.as_person: + elif issubclass(type_indicator, librarian.meta.types.person.Person): p = getattr(book_info, field.name) - if isinstance(p, dcparser.Person): + if isinstance(p, librarian.meta.types.person.Person): persons = str(p) else: persons = ', '.join(map(str, p)) fields[field.name] = persons - elif type_indicator == dcparser.as_date: + elif issubclass(type_indicator, librarian.meta.types.date.DateValue): dt = getattr(book_info, field.name) fields[field.name] = dt @@ -389,16 +395,16 @@ class Index(SolrIndex): return def fix_format(text): - # separator = [u" ", u"\t", u".", u";", u","] + # separator = [" ", "\t", ".", ";", ","] if isinstance(text, list): # need to join it first text = filter(lambda s: s is not None, content) - text = u' '.join(text) + text = ' '.join(text) # for i in range(len(text)): # if i > 0: # if text[i][0] not in separator\ # and text[i - 1][-1] not in separator: - # text.insert(i, u" ") + # text.insert(i, " ") return re.sub("(?m)/$", "", text) @@ -462,7 +468,7 @@ class Index(SolrIndex): elif end is not None and footnote is not [] and end.tag in self.footnote_tags: handle_text.pop() doc = add_part(snippets, header_index=position, header_type=header.tag, - text=u''.join(footnote), + text=''.join(footnote), is_footnote=True) self.index.add(doc) footnote = [] @@ -612,7 +618,7 @@ class SearchResult(object): return result def __str__(self): - return u"" % \ + return "" % \ (self.book_id, len(self._hits), len(self._processed_hits) if self._processed_hits else -1, self._score, len(self.snippets)) @@ -634,7 +640,10 @@ class SearchResult(object): def get_book(self): if self._book is not None: return self._book - self._book = catalogue.models.Book.objects.get(id=self.book_id) + try: + self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True) + except catalogue.models.Book.DoesNotExist: + self._book = None return self._book book = property(get_book) @@ -661,20 +670,18 @@ class SearchResult(object): lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect) - def remove_duplicates(lst, keyfn, compare): + def remove_duplicates(lst, keyfn, larger): els = {} for e in lst: eif = keyfn(e) if eif in els: - if compare(els[eif], e) >= 1: + if larger(els[eif], e): continue els[eif] = e return els.values() # remove fragments with duplicated fid's and duplicated snippets - frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE])) - # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT], - # lambda a, b: cmp(a[SCORE], b[SCORE])) + frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE]) # remove duplicate sections sections = {} @@ -713,12 +720,12 @@ class SearchResult(object): break def theme_by_name(n): - th = filter(lambda t: t.name == n, themes) + th = list(filter(lambda t: t.name == n, themes)) if th: return th[0] else: return None - themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit)) + themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit))) m = {'score': f[self.SCORE], 'fragment': frag, @@ -746,13 +753,17 @@ class SearchResult(object): books[r.book_id] = r return books.values() + def get_sort_key(self): + return (-self.score, + self.published_date, + self.book.sort_key_author if self.book else '', + self.book.sort_key if self.book else '') + def __lt__(self, other): - return (-self.score, self.published_date, self.book.sort_key_author, self.book.sort_key) > \ - (-other.score, other.published_date, other.book.sort_key_author, other.book.sort_key) + return self.get_sort_key() > other.get_sort_key() def __eq__(self, other): - return (self.score, self.published_date, self.book.sort_key_author, self.book.sort_key) == \ - (other.score, other.published_date, other.book.sort_key_author, other.book.sort_key) + return self.get_sort_key() == other.get_sort_key() def __len__(self): return len(self.hits) @@ -794,7 +805,7 @@ class PictureResult(object): self._hits.append(hit) def __str__(self): - return u"" % (self.picture_id, self._score) + return "" % (self.picture_id, self._score) def __repr__(self): return str(self) @@ -897,7 +908,7 @@ class Search(SolrIndex): def search_by_author(self, words): from catalogue.models import Book - books = Book.objects.filter(parent=None).order_by('-popularity__count') + books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count') for word in words: books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count') return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]] @@ -971,7 +982,7 @@ class Search(SolrIndex): idx += 1 except IOError as e: - book = catalogue.models.Book.objects.filter(id=book_id) + book = catalogue.models.Book.objects.filter(id=book_id, findable=True) if not book: log.error("Book does not exist for book id = %d" % book_id) elif not book.get().children.exists():