X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/cad95a5f21346628d1dffa3b50ffa8f38baa5972..6f12caf7d625f0bfffd2b138897fd8c711a6b2f5:/src/search/index.py?ds=sidebyside diff --git a/src/search/index.py b/src/search/index.py index a1c2716a8..22c9a02ae 100644 --- a/src/search/index.py +++ b/src/search/index.py @@ -1,29 +1,32 @@ -# -*- coding: utf-8 -*- # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from django.conf import settings - +from functools import reduce, total_ordering +from itertools import chain +import logging +import operator import os import re +from django.conf import settings from librarian import dcparser +import librarian.meta.types.date +import librarian.meta.types.person +import librarian.meta.types.text from librarian.parser import WLDocument from lxml import etree +import scorched import catalogue.models import picture.models from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook -from itertools import chain -import sunburnt -import custom -import operator -import logging from wolnelektury.utils import makedirs +from . import custom log = logging.getLogger('search') + if os.path.isfile(settings.SOLR_STOPWORDS): stopwords = set( - line.decode('utf-8').strip() + line.strip() for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#')) else: stopwords = set() @@ -95,7 +98,10 @@ class Snippets(object): of the snippet stored there. """ self.file.seek(pos[0], 0) - txt = self.file.read(pos[1]).decode('utf-8') + try: + txt = self.file.read(pos[1]).decode('utf-8') + except: + return '' return txt def close(self): @@ -122,6 +128,22 @@ class Index(SolrIndex): def __init__(self): super(Index, self).__init__(mode='rw') + def remove_snippets(self, book): + book.snippet_set.all().delete() + + def add_snippet(self, book, doc): + assert book.id == doc.pop('book_id') + # Fragments already exist and can be indexed where they live. + if 'fragment_anchor' in doc: + return + + text = doc.pop('text') + header_index = doc.pop('header_index') + book.snippet_set.create( + sec=header_index, + text=text, + ) + def delete_query(self, *queries): """ index.delete(queries=...) doesn't work, so let's reimplement it @@ -129,7 +151,7 @@ class Index(SolrIndex): """ uids = set() for q in queries: - if isinstance(q, sunburnt.search.LuceneQuery): + if isinstance(q, scorched.search.LuceneQuery): q = self.index.query(q) q.field_limiter.update(['uid']) st = 0 @@ -142,7 +164,8 @@ class Index(SolrIndex): uids.add(res['uid']) st += rows if uids: - self.index.delete(uids) + # FIXME: With Solr API change, this doesn't work. + #self.index.delete(uids) return True else: return False @@ -222,30 +245,29 @@ class Index(SolrIndex): doc['parent_id'] = int(book.parent.id) return doc - def remove_book(self, book_or_id, remove_snippets=True): + def remove_book(self, book, remove_snippets=True, legacy=True): """Removes a book from search index. book - Book instance.""" - if isinstance(book_or_id, catalogue.models.Book): - book_id = book_or_id.id - else: - book_id = book_or_id + if legacy: + self.delete_query(self.index.Q(book_id=book.id)) - self.delete_query(self.index.Q(book_id=book_id)) - - if remove_snippets: - snippets = Snippets(book_id) + if remove_snippets: + snippets = Snippets(book.id) snippets.remove() + self.remove_snippets(book) - def index_book(self, book, book_info=None, overwrite=True): + def index_book(self, book, book_info=None, overwrite=True, legacy=True): """ Indexes the book. Creates a lucene document for extracted metadata and calls self.index_content() to index the contents of the book. """ + if not book.xml_file: return + if overwrite: # we don't remove snippets, since they might be still needed by # threads using not reopened index - self.remove_book(book, remove_snippets=False) + self.remove_book(book, remove_snippets=False, legacy=legacy) book_doc = self.create_book_doc(book) meta_fields = self.extract_metadata(book, book_info, dc_only=[ @@ -258,7 +280,8 @@ class Index(SolrIndex): book_doc[n] = f book_doc['uid'] = "book%s" % book_doc['book_id'] - self.index.add(book_doc) + if legacy: + self.index.add(book_doc) del book_doc book_fields = { 'title': meta_fields['title'], @@ -270,7 +293,7 @@ class Index(SolrIndex): if tag_name in meta_fields: book_fields[tag_name] = meta_fields[tag_name] - self.index_content(book, book_fields=book_fields) + self.index_content(book, book_fields=book_fields, legacy=legacy) master_tags = [ 'opowiadanie', @@ -302,7 +325,7 @@ class Index(SolrIndex): fields = {} if book_info is None: - book_info = dcparser.parse(open(book.xml_file.path)) + book_info = dcparser.parse(open(book.xml_file.path, 'rb')) fields['slug'] = book.slug fields['is_book'] = True @@ -314,21 +337,20 @@ class Index(SolrIndex): if hasattr(book_info, field.name): if not getattr(book_info, field.name): continue - # since no type information is available, we use validator - type_indicator = field.validator - if type_indicator == dcparser.as_unicode: + type_indicator = field.value_type + if issubclass(type_indicator, librarian.meta.types.text.TextValue): s = getattr(book_info, field.name) if field.multiple: s = ', '.join(s) fields[field.name] = s - elif type_indicator == dcparser.as_person: + elif issubclass(type_indicator, librarian.meta.types.person.Person): p = getattr(book_info, field.name) - if isinstance(p, dcparser.Person): - persons = unicode(p) + if isinstance(p, librarian.meta.types.person.Person): + persons = str(p) else: - persons = ', '.join(map(unicode, p)) + persons = ', '.join(map(str, p)) fields[field.name] = persons - elif type_indicator == dcparser.as_date: + elif issubclass(type_indicator, librarian.meta.types.date.DateValue): dt = getattr(book_info, field.name) fields[field.name] = dt @@ -362,7 +384,7 @@ class Index(SolrIndex): if master.tag in self.master_tags: return master - def index_content(self, book, book_fields): + def index_content(self, book, book_fields, legacy=True): """ Walks the book XML and extract content from it. Adds parts for each header tag and for each fragment. @@ -389,16 +411,16 @@ class Index(SolrIndex): return def fix_format(text): - # separator = [u" ", u"\t", u".", u";", u","] + # separator = [" ", "\t", ".", ";", ","] if isinstance(text, list): # need to join it first text = filter(lambda s: s is not None, content) - text = u' '.join(text) + text = ' '.join(text) # for i in range(len(text)): # if i > 0: # if text[i][0] not in separator\ # and text[i - 1][-1] not in separator: - # text.insert(i, u" ") + # text.insert(i, " ") return re.sub("(?m)/$", "", text) @@ -462,9 +484,10 @@ class Index(SolrIndex): elif end is not None and footnote is not [] and end.tag in self.footnote_tags: handle_text.pop() doc = add_part(snippets, header_index=position, header_type=header.tag, - text=u''.join(footnote), - is_footnote=True) - self.index.add(doc) + text=''.join(footnote)) + self.add_snippet(book, doc) + if legacy: + self.index.add(doc) footnote = [] # handle fragments and themes. @@ -478,7 +501,7 @@ class Index(SolrIndex): fid = start.attrib['id'][1:] handle_text.append(lambda text: None) if start.text is not None: - fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(',')))) + fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(',')))) elif end is not None and end.tag == 'motyw': handle_text.pop() @@ -498,7 +521,10 @@ class Index(SolrIndex): fragment_anchor=fid, text=fix_format(frag['text']), themes=frag['themes']) - self.index.add(doc) + # Add searchable fragment + self.add_snippet(book, doc) + if legacy: + self.index.add(doc) # Collect content. @@ -510,7 +536,9 @@ class Index(SolrIndex): doc = add_part(snippets, header_index=position, header_type=header.tag, text=fix_format(content)) - self.index.add(doc) + self.add_snippet(book, doc) + if legacy: + self.index.add(doc) finally: snippets.close() @@ -557,6 +585,7 @@ class Index(SolrIndex): self.index.add(doc) +@total_ordering class SearchResult(object): def __init__(self, doc, how_found=None, query_terms=None): self.boost = 1.0 @@ -610,14 +639,14 @@ class SearchResult(object): result._book = book return result - def __unicode__(self): - return u"" % \ + def __str__(self): + return "" % \ (self.book_id, len(self._hits), len(self._processed_hits) if self._processed_hits else -1, self._score, len(self.snippets)) - def __str__(self): - return unicode(self).encode('utf-8') + def __bytes__(self): + return str(self).encode('utf-8') @property def score(self): @@ -633,7 +662,10 @@ class SearchResult(object): def get_book(self): if self._book is not None: return self._book - self._book = catalogue.models.Book.objects.get(id=self.book_id) + try: + self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True) + except catalogue.models.Book.DoesNotExist: + self._book = None return self._book book = property(get_book) @@ -653,27 +685,25 @@ class SearchResult(object): # to sections and fragments frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits) - sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits) + sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None] # sections not covered by fragments - sect = filter(lambda s: 0 == len(filter( + sect = filter(lambda s: 0 == len(list(filter( lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] < - f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect) + f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect) - def remove_duplicates(lst, keyfn, compare): + def remove_duplicates(lst, keyfn, larger): els = {} for e in lst: eif = keyfn(e) if eif in els: - if compare(els[eif], e) >= 1: + if larger(els[eif], e): continue els[eif] = e return els.values() # remove fragments with duplicated fid's and duplicated snippets - frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE])) - # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT], - # lambda a, b: cmp(a[SCORE], b[SCORE])) + frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE]) # remove duplicate sections sections = {} @@ -691,7 +721,7 @@ class SearchResult(object): m.update(s[self.OTHER]) sections[si] = m - hits = sections.values() + hits = list(sections.values()) for f in frags: try: @@ -705,19 +735,19 @@ class SearchResult(object): if self.query_terms is not None: for i in range(0, len(f[self.OTHER]['themes'])): tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ') - tms = map(unicode.lower, tms) + tms = map(str.lower, tms) for qt in self.query_terms: if qt in tms: themes_hit.add(f[self.OTHER]['themes'][i]) break def theme_by_name(n): - th = filter(lambda t: t.name == n, themes) + th = list(filter(lambda t: t.name == n, themes)) if th: return th[0] else: return None - themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit)) + themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit))) m = {'score': f[self.SCORE], 'fragment': frag, @@ -745,13 +775,17 @@ class SearchResult(object): books[r.book_id] = r return books.values() - def __cmp__(self, other): - c = cmp(self.score, other.score) - if c == 0: - # this is inverted, because earlier date is better - return cmp(other.published_date, self.published_date) - else: - return c + def get_sort_key(self): + return (-self.score, + self.published_date, + self.book.sort_key_author if self.book else '', + self.book.sort_key if self.book else '') + + def __lt__(self, other): + return self.get_sort_key() > other.get_sort_key() + + def __eq__(self, other): + return self.get_sort_key() == other.get_sort_key() def __len__(self): return len(self.hits) @@ -766,6 +800,7 @@ class SearchResult(object): return None +@total_ordering class PictureResult(object): def __init__(self, doc, how_found=None, query_terms=None): self.boost = 1.0 @@ -791,11 +826,11 @@ class PictureResult(object): self._hits.append(hit) - def __unicode__(self): - return u"" % (self.picture_id, self._score) + def __str__(self): + return "" % (self.picture_id, self._score) def __repr__(self): - return unicode(self) + return str(self) @property def score(self): @@ -829,7 +864,7 @@ class PictureResult(object): if self.query_terms is not None: for i in range(0, len(hit[self.OTHER]['themes'])): tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ') - tms = map(unicode.lower, tms) + tms = map(str.lower, tms) for qt in self.query_terms: if qt in tms: themes_hit.add(hit[self.OTHER]['themes'][i]) @@ -866,8 +901,11 @@ class PictureResult(object): books[r.picture_id] = r return books.values() - def __cmp__(self, other): - return cmp(self.score, other.score) + def __lt__(self, other): + return self.score < other.score + + def __eq__(self, other): + return self.score == other.score class Search(SolrIndex): @@ -892,7 +930,7 @@ class Search(SolrIndex): def search_by_author(self, words): from catalogue.models import Book - books = Book.objects.filter(parent=None).order_by('-popularity__count') + books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count') for word in words: books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count') return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]] @@ -957,14 +995,16 @@ class Search(SolrIndex): text = snippets.get((int(position), int(length))) snip = self.index.highlight(text=text, field=field, q=query) + if not snip and field == 'text': + snip = self.index.highlight(text=text, field='text_nonstem', q=query) if snip not in snips: snips[idx] = snip if snip: num -= 1 idx += 1 - except IOError, e: - book = catalogue.models.Book.objects.filter(id=book_id) + except IOError as e: + book = catalogue.models.Book.objects.filter(id=book_id, findable=True) if not book: log.error("Book does not exist for book id = %d" % book_id) elif not book.get().children.exists(): @@ -973,8 +1013,8 @@ class Search(SolrIndex): finally: snippets.close() - # remove verse end markers.. - snips = map(lambda s: s and s.replace("/\n", "\n"), snips) + # remove verse end markers.. + snips = [s.replace("/\n", "\n") if s else s for s in snips] searchresult.snippets = snips