X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/154870f0416b6b387637d6035c96321410512e95..0a7090f11131631647db366ff87976407e788412:/src/search/index.py

diff --git a/src/search/index.py b/src/search/index.py
index a712b0702..9784d49a7 100644
--- a/src/search/index.py
+++ b/src/search/index.py
@@ -1,7 +1,7 @@
 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
-from functools import total_ordering
+from functools import reduce, total_ordering
 from itertools import chain
 import logging
 import operator
@@ -9,6 +9,9 @@ import os
 import re
 from django.conf import settings
 from librarian import dcparser
+import librarian.meta.types.date
+import librarian.meta.types.person
+import librarian.meta.types.text
 from librarian.parser import WLDocument
 from lxml import etree
 import scorched
@@ -95,7 +98,10 @@ class Snippets(object):
         of the snippet stored there.
         """
         self.file.seek(pos[0], 0)
-        txt = self.file.read(pos[1]).decode('utf-8')
+        try:
+            txt = self.file.read(pos[1]).decode('utf-8')
+        except:
+            return ''
         return txt
 
     def close(self):
@@ -122,6 +128,22 @@ class Index(SolrIndex):
     def __init__(self):
         super(Index, self).__init__(mode='rw')
 
+    def remove_snippets(self, book):
+        book.snippet_set.all().delete()
+
+    def add_snippet(self, book, doc):
+        assert book.id == doc.pop('book_id')
+        # Fragments already exist and can be indexed where they live.
+        if 'fragment_anchor' in doc:
+            return
+
+        text = doc.pop('text')
+        header_index = doc.pop('header_index')
+        book.snippet_set.create(
+            sec=header_index,
+            text=text,
+        )
+
     def delete_query(self, *queries):
         """
         index.delete(queries=...) doesn't work, so let's reimplement it
@@ -142,7 +164,8 @@ class Index(SolrIndex):
                     uids.add(res['uid'])
                 st += rows
         if uids:
-            self.index.delete(uids)
+            # FIXME: With Solr API change, this doesn't work.
+            #self.index.delete(uids)
             return True
         else:
             return False
@@ -222,19 +245,15 @@ class Index(SolrIndex):
             doc['parent_id'] = int(book.parent.id)
         return doc
 
-    def remove_book(self, book_or_id, remove_snippets=True):
+    def remove_book(self, book, remove_snippets=True):
         """Removes a book from search index.
         book - Book instance."""
-        if isinstance(book_or_id, catalogue.models.Book):
-            book_id = book_or_id.id
-        else:
-            book_id = book_or_id
-
-        self.delete_query(self.index.Q(book_id=book_id))
+        self.delete_query(self.index.Q(book_id=book.id))
 
         if remove_snippets:
-            snippets = Snippets(book_id)
+            snippets = Snippets(book.id)
             snippets.remove()
+        self.remove_snippets(book)
 
     def index_book(self, book, book_info=None, overwrite=True):
         """
@@ -242,6 +261,8 @@ class Index(SolrIndex):
         Creates a lucene document for extracted metadata
         and calls self.index_content() to index the contents of the book.
""" + if not book.xml_file: return + if overwrite: # we don't remove snippets, since they might be still needed by # threads using not reopened index @@ -302,7 +323,7 @@ class Index(SolrIndex): fields = {} if book_info is None: - book_info = dcparser.parse(open(book.xml_file.path)) + book_info = dcparser.parse(open(book.xml_file.path, 'rb')) fields['slug'] = book.slug fields['is_book'] = True @@ -314,21 +335,20 @@ class Index(SolrIndex): if hasattr(book_info, field.name): if not getattr(book_info, field.name): continue - # since no type information is available, we use validator - type_indicator = field.validator - if type_indicator == dcparser.as_unicode: + type_indicator = field.value_type + if issubclass(type_indicator, librarian.meta.types.text.TextValue): s = getattr(book_info, field.name) if field.multiple: s = ', '.join(s) fields[field.name] = s - elif type_indicator == dcparser.as_person: + elif issubclass(type_indicator, librarian.meta.types.person.Person): p = getattr(book_info, field.name) - if isinstance(p, dcparser.Person): + if isinstance(p, librarian.meta.types.person.Person): persons = str(p) else: persons = ', '.join(map(str, p)) fields[field.name] = persons - elif type_indicator == dcparser.as_date: + elif issubclass(type_indicator, librarian.meta.types.date.DateValue): dt = getattr(book_info, field.name) fields[field.name] = dt @@ -389,16 +409,16 @@ class Index(SolrIndex): return def fix_format(text): - # separator = [u" ", u"\t", u".", u";", u","] + # separator = [" ", "\t", ".", ";", ","] if isinstance(text, list): # need to join it first text = filter(lambda s: s is not None, content) - text = u' '.join(text) + text = ' '.join(text) # for i in range(len(text)): # if i > 0: # if text[i][0] not in separator\ # and text[i - 1][-1] not in separator: - # text.insert(i, u" ") + # text.insert(i, " ") return re.sub("(?m)/$", "", text) @@ -462,8 +482,8 @@ class Index(SolrIndex): elif end is not None and footnote is not [] and end.tag in self.footnote_tags: handle_text.pop() doc = add_part(snippets, header_index=position, header_type=header.tag, - text=u''.join(footnote), - is_footnote=True) + text=''.join(footnote)) + self.add_snippet(book, doc) self.index.add(doc) footnote = [] @@ -498,6 +518,8 @@ class Index(SolrIndex): fragment_anchor=fid, text=fix_format(frag['text']), themes=frag['themes']) + # Add searchable fragment + self.add_snippet(book, doc) self.index.add(doc) # Collect content. 
@@ -510,6 +532,7 @@ class Index(SolrIndex):
                     doc = add_part(snippets, header_index=position,
                                    header_type=header.tag, text=fix_format(content))
+                    self.add_snippet(book, doc)
                     self.index.add(doc)
 
         finally:
@@ -612,7 +635,7 @@ class SearchResult(object):
         return result
 
     def __str__(self):
-        return u"" % \
+        return "" % \
             (self.book_id, len(self._hits),
              len(self._processed_hits) if self._processed_hits else -1,
              self._score, len(self.snippets))
@@ -634,7 +657,10 @@ class SearchResult(object):
     def get_book(self):
         if self._book is not None:
             return self._book
-        self._book = catalogue.models.Book.objects.get(id=self.book_id)
+        try:
+            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
+        except catalogue.models.Book.DoesNotExist:
+            self._book = None
         return self._book
 
     book = property(get_book)
@@ -661,20 +687,18 @@ class SearchResult(object):
                 lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
                 frags))), sect)
 
-        def remove_duplicates(lst, keyfn, compare):
+        def remove_duplicates(lst, keyfn, larger):
             els = {}
             for e in lst:
                 eif = keyfn(e)
                 if eif in els:
-                    if compare(els[eif], e) >= 1:
+                    if larger(els[eif], e):
                         continue
                 els[eif] = e
             return els.values()
 
         # remove fragments with duplicated fid's and duplicated snippets
-        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
-        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
-        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
+        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
 
         # remove duplicate sections
         sections = {}
@@ -713,12 +737,12 @@ class SearchResult(object):
                     break
 
             def theme_by_name(n):
-                th = filter(lambda t: t.name == n, themes)
+                th = list(filter(lambda t: t.name == n, themes))
                 if th:
                     return th[0]
                 else:
                     return None
-            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
+            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
 
             m = {'score': f[self.SCORE],
                  'fragment': frag,
@@ -746,13 +770,17 @@ class SearchResult(object):
                 books[r.book_id] = r
         return books.values()
 
+    def get_sort_key(self):
+        return (-self.score,
+                self.published_date,
+                self.book.sort_key_author if self.book else '',
+                self.book.sort_key if self.book else '')
+
     def __lt__(self, other):
-        return (-self.score, self.published_date, self.book.sort_key_author, self.book.sort_key) > \
-            (-other.score, other.published_date, other.book.sort_key_author, other.book.sort_key)
+        return self.get_sort_key() > other.get_sort_key()
 
     def __eq__(self, other):
-        return (self.score, self.published_date, self.book.sort_key_author, self.book.sort_key) == \
-            (other.score, other.published_date, other.book.sort_key_author, other.book.sort_key)
+        return self.get_sort_key() == other.get_sort_key()
 
     def __len__(self):
         return len(self.hits)
@@ -794,7 +822,7 @@ class PictureResult(object):
         self._hits.append(hit)
 
     def __str__(self):
-        return u"" % (self.picture_id, self._score)
+        return "" % (self.picture_id, self._score)
 
     def __repr__(self):
         return str(self)
@@ -897,7 +925,7 @@ class Search(SolrIndex):
 
     def search_by_author(self, words):
         from catalogue.models import Book
-        books = Book.objects.filter(parent=None).order_by('-popularity__count')
+        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
         for word in words:
            books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
         return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
 
@@ -971,7 +999,7 @@ class Search(SolrIndex):
                         idx += 1
 
                 except IOError as e:
-                    book = catalogue.models.Book.objects.filter(id=book_id)
+                    book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
                     if not book:
                         log.error("Book does not exist for book id = %d" % book_id)
                     elif not book.get().children.exists():