# -*- coding: utf-8 -*-
-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
from django.conf import settings
import os
import custom
import operator
+log = logging.getLogger('search')
class SolrIndex(object):
def __init__(self, mode=None):
def close(self):
"""Close snippet file"""
- self.file.close()
+ if self.file:
+ self.file.close()
def remove(self):
self.revision = None
Class indexing books.
"""
def __init__(self):
- super(Index, self).__init__()
+ super(Index, self).__init__(mode='rw')
def delete_query(self, *queries):
"""
for res in ids:
uids.add(res['uid'])
st += rows
- # print "Will delete %s" % ','.join([x for x in uids])
if uids:
self.index.delete(uids)
return True
Removes all tags from index, then index them again.
Indexed fields include: id, name (with and without polish stems), category
"""
+ log.debug("Indexing tags")
remove_only = kw.get('remove_only', False)
# first, remove tags from index.
if tags:
q_id_cat = self.index.Q(q_id & q_cat)
tag_qs.append(q_id_cat)
- self.delete_query(tag_qs)
+ self.delete_query(*tag_qs)
else: # all
q = self.index.Q(tag_id__any=True)
self.delete_query(q)
"uid": "tag%d" % tag.id
}
self.index.add(doc)
- print "%s %s" % (doc['tag_name'], doc['tag_category'])
def create_book_doc(self, book):
"""
self.remove_book(book, remove_snippets=False)
book_doc = self.create_book_doc(book)
- meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+ meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
# let's not index it - it's only used for extracting publish date
if 'source_name' in meta_fields:
del meta_fields['source_name']
'authors': meta_fields['authors'],
'published_date': meta_fields['published_date']
}
+
if 'translators' in meta_fields:
book_fields['translators'] = meta_fields['translators']
doc = add_part(snippets, header_index=position, header_type=header.tag,
text=u''.join(footnote),
is_footnote=True)
-
self.index.add(doc)
- #print "@ footnote text: %s" % footnote
footnote = []
# handle fragments and themes.
fragment_anchor=fid,
text=fix_format(frag['text']),
themes=frag['themes'])
- #print '@ FRAG %s' % frag['content']
self.index.add(doc)
# Collect content.
# in the end, add a section text.
doc = add_part(snippets, header_index=position,
header_type=header.tag, text=fix_format(content))
- #print '@ CONTENT: %s' % fix_format(content)
self.index.add(doc)
class SearchResult(object):
- def __init__(self, doc, how_found=None, query=None):
+ def __init__(self, doc, how_found=None, query=None, query_terms=None):
# self.search = search
self.boost = 1.0
self._hits = []
self._processed_hits = None # processed hits
self.snippets = []
+ self.query_terms = query_terms
if 'score' in doc:
self._score = doc['score']
header_span = header_span is not None and int(header_span) or 1
fragment = doc.get("fragment_anchor", None)
snippets_pos = (doc['snippets_position'], doc['snippets_length'])
- snippets_rev = doc['snippets_revision']
+ snippets_rev = doc.get('snippets_revision', None)
hit = (sec + (header_span,), fragment, self._score, {
'how_found': how_found,
'snippets_pos': snippets_pos,
- 'snippets_revision': snippets_rev
+ 'snippets_revision': snippets_rev,
+ 'themes': doc.get('themes', []),
+ 'themes_pl': doc.get('themes_pl', [])
})
self._hits.append(hit)
def __unicode__(self):
- return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
+ return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
(self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
-
+
def __str__(self):
# Byte-string form: the unicode() representation of this object encoded as UTF-8.
return unicode(self).encode('utf-8')
except catalogue.models.Fragment.DoesNotExist:
# stale index
continue
-
# Figure out if we were searching for a token matching some word in theme name.
themes = frag.tags.filter(category='theme')
- themes_hit = []
- # if self.searched is not None:
- # tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
- # for theme in themes:
- # name_tokens = self.search.get_tokens(theme.name, 'POLISH')
- # for t in tokens:
- # if t in name_tokens:
- # if not theme in themes_hit:
- # themes_hit.append(theme)
- # break
+ themes_hit = set()
+ if self.query_terms is not None:
+ for i in range(0, len(f[self.OTHER]['themes'])):
+ tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
+ tms = map(unicode.lower, tms)
+ for qt in self.query_terms:
+ if qt in tms:
+ themes_hit.add(f[self.OTHER]['themes'][i])
+ break
+
+ def theme_by_name(n):
+ th = filter(lambda t: t.name == n, themes)
+ if th:
+ return th[0]
+ else:
+ return None
+ themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
m = {'score': f[self.SCORE],
'fragment': frag,
Search facilities.
"""
def __init__(self, default_field="text"):
- super(Search, self).__init__()
+ super(Search, self).__init__(mode='r')
- # def get_tokens(self, searched, field='text', cached=None):
- # """returns tokens analyzed by a proper (for a field) analyzer
- # argument can be: StringReader, string/unicode, or tokens. In the last case
- # they will just be returned (so we can reuse tokens, if we don't change the analyzer)
- # """
- # if cached is not None and field in cached:
- # return cached[field]
-
- # if isinstance(searched, str) or isinstance(searched, unicode):
- # searched = StringReader(searched)
- # elif isinstance(searched, list):
- # return searched
-
- # searched.reset()
- # tokens = self.analyzer.reusableTokenStream(field, searched)
- # toks = []
- # while tokens.incrementToken():
- # cta = tokens.getAttribute(CharTermAttribute.class_)
- # toks.append(cta.toString())
-
- # if cached is not None:
- # cached[field] = toks
-
- # return toks
-
- # @staticmethod
- # def fuzziness(fuzzy):
- # """Helper method to sanitize fuzziness"""
- # if not fuzzy:
- # return None
- # if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
- # return fuzzy
- # else:
- # return 0.5
-
- # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
- # """
- # Return a PhraseQuery with a series of tokens.
- # """
- # if fuzzy:
- # phrase = MultiPhraseQuery()
- # for t in tokens:
- # term = Term(field, t)
- # fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
- # fuzzterms = []
-
- # while True:
- # ft = fuzzterm.term()
- # if ft:
- # fuzzterms.append(ft)
- # if not fuzzterm.next(): break
- # if fuzzterms:
- # phrase.add(JArray('object')(fuzzterms, Term))
- # else:
- # phrase.add(term)
- # else:
- # phrase = PhraseQuery()
- # phrase.setSlop(slop)
- # for t in tokens:
- # term = Term(field, t)
- # phrase.add(term)
- # return phrase
def make_term_query(self, query, field='text', modal=operator.or_):
"""
modal - applies to boolean query
fuzzy - should the query be fuzzy.
"""
+ if query is None: query = ''
q = self.index.Q()
q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
query.split(r" ")), q)
return [SearchResult(found, how_found=u'search_phrase') for found in res]
def search_some(self, searched, fields, book=True,
- filters=None,
- snippets=True):
+ filters=None, snippets=True, query_terms=None):
assert isinstance(fields, list)
if filters is None: filters = []
if book: filters.append(self.index.Q(is_book=True))
query = self.index.query(query)
query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
res = query.execute()
- return [SearchResult(found, how_found='search_some') for found in res]
-
- # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
- # """
- # Search for perfect book matches. Just see if the query matches with some author or title,
- # taking hints into account.
- # """
- # fields_to_search = ['authors', 'title']
- # only_in = None
- # if hint:
- # if not hint.should_search_for_book():
- # return []
- # fields_to_search = hint.just_search_in(fields_to_search)
- # only_in = hint.book_filter()
-
- # qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
-
- # books = []
- # for q in qrys:
- # top = self.searcher.search(q,
- # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
- # max_results)
- # for found in top.scoreDocs:
- # books.append(SearchResult(self, found, how_found="search_perfect_book"))
- # return books
-
- # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
- # fields_to_search = ['tags', 'authors', 'title']
-
- # only_in = None
- # if hint:
- # if not hint.should_search_for_book():
- # return []
- # fields_to_search = hint.just_search_in(fields_to_search)
- # only_in = hint.book_filter()
-
- # tokens = self.get_tokens(searched, field='SIMPLE')
-
- # q = BooleanQuery()
-
- # for fld in fields_to_search:
- # q.add(BooleanClause(self.make_term_query(tokens, field=fld,
- # fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
- # books = []
- # top = self.searcher.search(q,
- # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
- # max_results)
- # for found in top.scoreDocs:
- # books.append(SearchResult(self, found, how_found="search_book"))
-
- # return books
-
- # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
- # """
- # Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
- # some part/fragment of the book.
- # """
- # qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
+ return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
- # flt = None
- # if hint:
- # flt = hint.part_filter()
- # books = []
- # for q in qrys:
- # top = self.searcher.search(q,
- # self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
- # flt]),
- # max_results)
- # for found in top.scoreDocs:
- # books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
-
- # return books
-
- def search_everywhere(self, searched):
+ def search_everywhere(self, searched, query_terms=None):
"""
Tries to use search terms to match different fields of book (or its parts).
E.g. one word can be an author surname, another be a part of the title, and the rest
"""
books = []
# content only query : themes x content
-
q = self.make_term_query(searched, 'text')
q_themes = self.make_term_query(searched, 'themes_pl')
res = query.execute()
for found in res:
- books.append(SearchResult(found, how_found='search_everywhere_themesXcontent'))
+ books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
# query themes/content x author/title/tags
in_content = self.index.Q()
q = in_content & in_meta
res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
+
for found in res:
- books.append(SearchResult(found, how_found='search_everywhere'))
+ books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
return books
continue
text = snippets.get((int(position),
int(length)))
- print "== %s -- %s ==" % (query, text)
snip = self.index.highlight(text=text, field=field, q=query)
snips[idx] = snip
if snip:
idx += 1
except IOError, e:
- log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
+ log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
return []
finally:
snippets.close()
snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
searchresult.snippets = snips
+
return snips
def hint_tags(self, query, pdcounter=True, prefix=True):
res = self.apply_filters(query, filters).execute()
tags = []
+ pd_tags = []
+
for doc in res:
is_pdcounter = doc.get('is_pdcounter', False)
category = doc.get('tag_category')
tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
tag.category = 'pd_book' # make it look more lik a tag.
else:
- print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
+ print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)).encode('utf-8')
+ pd_tags.append(tag)
else:
tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
- # don't add the pdcounter tag if same tag already exists
-
- tags.append(tag)
+ tags.append(tag)
except catalogue.models.Tag.DoesNotExist: pass
except PDCounterAuthor.DoesNotExist: pass
except PDCounterBook.DoesNotExist: pass
+ tags_slugs = set(map(lambda t: t.slug, tags))
+ tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
+
log.debug('search_tags: %s' % tags)
return tags
Searches for Book objects using query
"""
bks = []
+ bks_found = set()
+ query = query.query(is_book=True)
res = self.apply_filters(query, filters).field_limit(['book_id'])
for r in res:
try:
- bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
+ bid = r['book_id']
+ if not bid in bks_found:
+ bks.append(catalogue.models.Book.objects.get(id=bid))
+ bks_found.add(bid)
except catalogue.models.Book.DoesNotExist: pass
return bks
- # def make_prefix_phrase(self, toks, field):
- # q = MultiPhraseQuery()
- # for i in range(len(toks)):
- # t = Term(field, toks[i])
- # if i == len(toks) - 1:
- # pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
- # if pterms:
- # q.add(pterms)
- # else:
- # q.add(t)
- # else:
- # q.add(t)
- # return q
-
- # @staticmethod
- # def term_filter(term, inverse=False):
- # only_term = TermsFilter()
- # only_term.addTerm(term)
-
- # if inverse:
- # neg = BooleanFilter()
- # neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
- # only_term = neg
-
- # return only_term
-
-
@staticmethod
def apply_filters(query, filters):
    """
    Narrow `query` by applying every filter in turn.

    query   - object exposing a query(f) method that returns the narrowed query
    filters - iterable of filter expressions; None or empty means "no filtering"

    Returns the query produced by the last filter, or `query` itself
    when there is nothing to apply.
    """
    # Tolerate filters=None instead of failing with a TypeError on iteration.
    for f in filters or ():
        query = query.query(f)
    return query
-
- # def filtered_categories(self, tags):
- # """
- # Return a list of tag categories, present in tags list.
- # """
- # cats = {}
- # for t in tags:
- # cats[t.category] = True
- # return cats.keys()
-
- # def hint(self):
- # return Hint(self)