X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/8b808f24709efb16f6b6eff6abb05b41341573c7..99f6dd4f1ff0390de9a9bbd4e3352b953cb9a235:/apps/search/index.py diff --git a/apps/search/index.py b/apps/search/index.py index 32cf5f922..7fb60b508 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- - +# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# from django.conf import settings import os @@ -18,6 +20,7 @@ import sunburnt import custom import operator +log = logging.getLogger('search') class SolrIndex(object): def __init__(self, mode=None): @@ -92,7 +95,8 @@ class Snippets(object): def close(self): """Close snippet file""" - self.file.close() + if self.file: + self.file.close() def remove(self): self.revision = None @@ -111,7 +115,7 @@ class Index(SolrIndex): Class indexing books. """ def __init__(self): - super(Index, self).__init__() + super(Index, self).__init__(mode='rw') def delete_query(self, *queries): """ @@ -132,7 +136,6 @@ class Index(SolrIndex): for res in ids: uids.add(res['uid']) st += rows - # print "Will delete %s" % ','.join([x for x in uids]) if uids: self.index.delete(uids) return True @@ -145,6 +148,7 @@ class Index(SolrIndex): Removes all tags from index, then index them again. Indexed fields include: id, name (with and without polish stems), category """ + log.debug("Indexing tags") remove_only = kw.get('remove_only', False) # first, remove tags from index. if tags: @@ -161,7 +165,7 @@ class Index(SolrIndex): q_id_cat = self.index.Q(q_id & q_cat) tag_qs.append(q_id_cat) - self.delete_query(tag_qs) + self.delete_query(*tag_qs) else: # all q = self.index.Q(tag_id__any=True) self.delete_query(q) @@ -202,7 +206,6 @@ class Index(SolrIndex): "uid": "tag%d" % tag.id } self.index.add(doc) - print "%s %s" % (doc['tag_name'], doc['tag_category']) def create_book_doc(self, book): """ @@ -241,7 +244,7 @@ class Index(SolrIndex): self.remove_book(book, remove_snippets=False) book_doc = self.create_book_doc(book) - meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title']) + meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title']) # let's not index it - it's only used for extracting publish date if 'source_name' in meta_fields: del meta_fields['source_name'] @@ -257,6 +260,7 @@ class Index(SolrIndex): 'authors': meta_fields['authors'], 'published_date': meta_fields['published_date'] } + if 'translators' in meta_fields: book_fields['translators'] = meta_fields['translators'] @@ -461,9 +465,7 @@ class Index(SolrIndex): doc = add_part(snippets, header_index=position, header_type=header.tag, text=u''.join(footnote), is_footnote=True) - self.index.add(doc) - #print "@ footnote text: %s" % footnote footnote = [] # handle fragments and themes. @@ -496,7 +498,6 @@ class Index(SolrIndex): fragment_anchor=fid, text=fix_format(frag['text']), themes=frag['themes']) - #print '@ FRAG %s' % frag['content'] self.index.add(doc) # Collect content. @@ -509,7 +510,6 @@ class Index(SolrIndex): # in the end, add a section text. doc = add_part(snippets, header_index=position, header_type=header.tag, text=fix_format(content)) - #print '@ CONTENT: %s' % fix_format(content) self.index.add(doc) @@ -518,12 +518,13 @@ class Index(SolrIndex): class SearchResult(object): - def __init__(self, doc, how_found=None, query=None): + def __init__(self, doc, how_found=None, query=None, query_terms=None): # self.search = search self.boost = 1.0 self._hits = [] self._processed_hits = None # processed hits self.snippets = [] + self.query_terms = query_terms if 'score' in doc: self._score = doc['score'] @@ -546,20 +547,22 @@ class SearchResult(object): header_span = header_span is not None and int(header_span) or 1 fragment = doc.get("fragment_anchor", None) snippets_pos = (doc['snippets_position'], doc['snippets_length']) - snippets_rev = doc['snippets_revision'] + snippets_rev = doc.get('snippets_revision', None) hit = (sec + (header_span,), fragment, self._score, { 'how_found': how_found, 'snippets_pos': snippets_pos, - 'snippets_revision': snippets_rev + 'snippets_revision': snippets_rev, + 'themes': doc.get('themes', []), + 'themes_pl': doc.get('themes_pl', []) }) self._hits.append(hit) def __unicode__(self): - return u"" % \ (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets)) - + def __str__(self): return unicode(self).encode('utf-8') @@ -647,19 +650,25 @@ class SearchResult(object): except catalogue.models.Fragment.DoesNotExist: # stale index continue - # Figure out if we were searching for a token matching some word in theme name. themes = frag.tags.filter(category='theme') - themes_hit = [] - # if self.searched is not None: - # tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache) - # for theme in themes: - # name_tokens = self.search.get_tokens(theme.name, 'POLISH') - # for t in tokens: - # if t in name_tokens: - # if not theme in themes_hit: - # themes_hit.append(theme) - # break + themes_hit = set() + if self.query_terms is not None: + for i in range(0, len(f[self.OTHER]['themes'])): + tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ') + tms = map(unicode.lower, tms) + for qt in self.query_terms: + if qt in tms: + themes_hit.add(f[self.OTHER]['themes'][i]) + break + + def theme_by_name(n): + th = filter(lambda t: t.name == n, themes) + if th: + return th[0] + else: + return None + themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit)) m = {'score': f[self.SCORE], 'fragment': frag, @@ -713,70 +722,8 @@ class Search(SolrIndex): Search facilities. """ def __init__(self, default_field="text"): - super(Search, self).__init__() + super(Search, self).__init__(mode='r') - # def get_tokens(self, searched, field='text', cached=None): - # """returns tokens analyzed by a proper (for a field) analyzer - # argument can be: StringReader, string/unicode, or tokens. In the last case - # they will just be returned (so we can reuse tokens, if we don't change the analyzer) - # """ - # if cached is not None and field in cached: - # return cached[field] - - # if isinstance(searched, str) or isinstance(searched, unicode): - # searched = StringReader(searched) - # elif isinstance(searched, list): - # return searched - - # searched.reset() - # tokens = self.analyzer.reusableTokenStream(field, searched) - # toks = [] - # while tokens.incrementToken(): - # cta = tokens.getAttribute(CharTermAttribute.class_) - # toks.append(cta.toString()) - - # if cached is not None: - # cached[field] = toks - - # return toks - - # @staticmethod - # def fuzziness(fuzzy): - # """Helper method to sanitize fuzziness""" - # if not fuzzy: - # return None - # if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0: - # return fuzzy - # else: - # return 0.5 - - # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False): - # """ - # Return a PhraseQuery with a series of tokens. - # """ - # if fuzzy: - # phrase = MultiPhraseQuery() - # for t in tokens: - # term = Term(field, t) - # fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy)) - # fuzzterms = [] - - # while True: - # ft = fuzzterm.term() - # if ft: - # fuzzterms.append(ft) - # if not fuzzterm.next(): break - # if fuzzterms: - # phrase.add(JArray('object')(fuzzterms, Term)) - # else: - # phrase.add(term) - # else: - # phrase = PhraseQuery() - # phrase.setSlop(slop) - # for t in tokens: - # term = Term(field, t) - # phrase.add(term) - # return phrase def make_term_query(self, query, field='text', modal=operator.or_): """ @@ -784,6 +731,7 @@ class Search(SolrIndex): modal - applies to boolean query fuzzy - should the query by fuzzy. """ + if query is None: query = '' q = self.index.Q() q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q) @@ -802,8 +750,7 @@ class Search(SolrIndex): return [SearchResult(found, how_found=u'search_phrase') for found in res] def search_some(self, searched, fields, book=True, - filters=None, - snippets=True): + filters=None, snippets=True, query_terms=None): assert isinstance(fields, list) if filters is None: filters = [] if book: filters.append(self.index.Q(is_book=True)) @@ -816,82 +763,10 @@ class Search(SolrIndex): query = self.index.query(query) query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True) res = query.execute() - return [SearchResult(found, how_found='search_some') for found in res] - - # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None): - # """ - # Search for perfect book matches. Just see if the query matches with some author or title, - # taking hints into account. - # """ - # fields_to_search = ['authors', 'title'] - # only_in = None - # if hint: - # if not hint.should_search_for_book(): - # return [] - # fields_to_search = hint.just_search_in(fields_to_search) - # only_in = hint.book_filter() - - # qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search] - - # books = [] - # for q in qrys: - # top = self.searcher.search(q, - # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]), - # max_results) - # for found in top.scoreDocs: - # books.append(SearchResult(self, found, how_found="search_perfect_book")) - # return books - - # def search_book(self, searched, max_results=20, fuzzy=False, hint=None): - # fields_to_search = ['tags', 'authors', 'title'] - - # only_in = None - # if hint: - # if not hint.should_search_for_book(): - # return [] - # fields_to_search = hint.just_search_in(fields_to_search) - # only_in = hint.book_filter() - - # tokens = self.get_tokens(searched, field='SIMPLE') - - # q = BooleanQuery() - - # for fld in fields_to_search: - # q.add(BooleanClause(self.make_term_query(tokens, field=fld, - # fuzzy=fuzzy), BooleanClause.Occur.SHOULD)) - - # books = [] - # top = self.searcher.search(q, - # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]), - # max_results) - # for found in top.scoreDocs: - # books.append(SearchResult(self, found, how_found="search_book")) - - # return books - - # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None): - # """ - # Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase()) - # some part/fragment of the book. - # """ - # qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']] + return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res] - # flt = None - # if hint: - # flt = hint.part_filter() - # books = [] - # for q in qrys: - # top = self.searcher.search(q, - # self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True), - # flt]), - # max_results) - # for found in top.scoreDocs: - # books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts')) - - # return books - - def search_everywhere(self, searched): + def search_everywhere(self, searched, query_terms=None): """ Tries to use search terms to match different fields of book (or its parts). E.g. one word can be an author survey, another be a part of the title, and the rest @@ -899,7 +774,6 @@ class Search(SolrIndex): """ books = [] # content only query : themes x content - q = self.make_term_query(searched, 'text') q_themes = self.make_term_query(searched, 'themes_pl') @@ -907,7 +781,7 @@ class Search(SolrIndex): res = query.execute() for found in res: - books.append(SearchResult(found, how_found='search_everywhere_themesXcontent')) + books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms)) # query themes/content x author/title/tags in_content = self.index.Q() @@ -921,8 +795,9 @@ class Search(SolrIndex): q = in_content & in_meta res = self.index.query(q).field_limit(score=True, all_fields=True).execute() + for found in res: - books.append(SearchResult(found, how_found='search_everywhere')) + books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms)) return books @@ -946,7 +821,6 @@ class Search(SolrIndex): continue text = snippets.get((int(position), int(length))) - print "== %s -- %s ==" % (query, text) snip = self.index.highlight(text=text, field=field, q=query) snips[idx] = snip if snip: @@ -954,7 +828,7 @@ class Search(SolrIndex): idx += 1 except IOError, e: - log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e)) + log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e)) return [] finally: snippets.close() @@ -963,6 +837,7 @@ class Search(SolrIndex): snips = map(lambda s: s and s.replace("/\n", "\n"), snips) searchresult.snippets = snips + return snips def hint_tags(self, query, pdcounter=True, prefix=True): @@ -991,6 +866,8 @@ class Search(SolrIndex): res = self.apply_filters(query, filters).execute() tags = [] + pd_tags = [] + for doc in res: is_pdcounter = doc.get('is_pdcounter', False) category = doc.get('tag_category') @@ -1002,17 +879,19 @@ class Search(SolrIndex): tag = PDCounterBook.objects.get(id=doc.get('tag_id')) tag.category = 'pd_book' # make it look more lik a tag. else: - print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category) + print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)).encode('utf-8') + pd_tags.append(tag) else: tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id")) - # don't add the pdcounter tag if same tag already exists - - tags.append(tag) + tags.append(tag) except catalogue.models.Tag.DoesNotExist: pass except PDCounterAuthor.DoesNotExist: pass except PDCounterBook.DoesNotExist: pass + tags_slugs = set(map(lambda t: t.slug, tags)) + tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags) + log.debug('search_tags: %s' % tags) return tags @@ -1038,40 +917,18 @@ class Search(SolrIndex): Searches for Book objects using query """ bks = [] + bks_found = set() + query = query.query(is_book=True) res = self.apply_filters(query, filters).field_limit(['book_id']) for r in res: try: - bks.append(catalogue.models.Book.objects.get(id=r['book_id'])) + bid = r['book_id'] + if not bid in bks_found: + bks.append(catalogue.models.Book.objects.get(id=bid)) + bks_found.add(bid) except catalogue.models.Book.DoesNotExist: pass return bks - # def make_prefix_phrase(self, toks, field): - # q = MultiPhraseQuery() - # for i in range(len(toks)): - # t = Term(field, toks[i]) - # if i == len(toks) - 1: - # pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t)) - # if pterms: - # q.add(pterms) - # else: - # q.add(t) - # else: - # q.add(t) - # return q - - # @staticmethod - # def term_filter(term, inverse=False): - # only_term = TermsFilter() - # only_term.addTerm(term) - - # if inverse: - # neg = BooleanFilter() - # neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT)) - # only_term = neg - - # return only_term - - @staticmethod def apply_filters(query, filters): @@ -1083,15 +940,3 @@ class Search(SolrIndex): for f in filters: query = query.query(f) return query - - # def filtered_categories(self, tags): - # """ - # Return a list of tag categories, present in tags list. - # """ - # cats = {} - # for t in tags: - # cats[t.category] = True - # return cats.keys() - - # def hint(self): - # return Hint(self)