# -*- coding: utf-8 -*-
-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
from django.conf import settings
import os
import custom
import operator
+log = logging.getLogger('search')
class SolrIndex(object):
def __init__(self, mode=None):
def close(self):
"""Close snippet file"""
- self.file.close()
+ if self.file:
+ self.file.close()
def remove(self):
self.revision = None
Class indexing books.
"""
def __init__(self):
- super(Index, self).__init__()
+ super(Index, self).__init__(mode='rw')
def delete_query(self, *queries):
"""
for res in ids:
uids.add(res['uid'])
st += rows
- # print "Will delete %s" % ','.join([x for x in uids])
if uids:
self.index.delete(uids)
return True
Removes all tags from index, then index them again.
Indexed fields include: id, name (with and without Polish stems), category
"""
+ log.debug("Indexing tags")
remove_only = kw.get('remove_only', False)
# first, remove tags from index.
if tags:
q_id_cat = self.index.Q(q_id & q_cat)
tag_qs.append(q_id_cat)
- self.delete_query(tag_qs)
+ self.delete_query(*tag_qs)
else: # all
q = self.index.Q(tag_id__any=True)
self.delete_query(q)
self.remove_book(book, remove_snippets=False)
book_doc = self.create_book_doc(book)
- meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+ meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
# let's not index it - it's only used for extracting publish date
if 'source_name' in meta_fields:
del meta_fields['source_name']
'authors': meta_fields['authors'],
'published_date': meta_fields['published_date']
}
+
if 'translators' in meta_fields:
book_fields['translators'] = meta_fields['translators']
doc = add_part(snippets, header_index=position, header_type=header.tag,
text=u''.join(footnote),
is_footnote=True)
-
self.index.add(doc)
- #print "@ footnote text: %s" % footnote
footnote = []
# handle fragments and themes.
fragment_anchor=fid,
text=fix_format(frag['text']),
themes=frag['themes'])
- #print '@ FRAG %s' % frag['content']
self.index.add(doc)
# Collect content.
# in the end, add a section text.
doc = add_part(snippets, header_index=position,
header_type=header.tag, text=fix_format(content))
- #print '@ CONTENT: %s' % fix_format(content)
self.index.add(doc)
header_span = header_span is not None and int(header_span) or 1
fragment = doc.get("fragment_anchor", None)
snippets_pos = (doc['snippets_position'], doc['snippets_length'])
- snippets_rev = doc['snippets_revision']
+ snippets_rev = doc.get('snippets_revision', None)
hit = (sec + (header_span,), fragment, self._score, {
'how_found': how_found,
self._hits.append(hit)
def __unicode__(self):
- return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
+ return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
(self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
def __str__(self):
except catalogue.models.Fragment.DoesNotExist:
# stale index
continue
- print f
# Figure out if we were searching for a token matching some word in theme name.
themes = frag.tags.filter(category='theme')
themes_hit = set()
Search facilities.
"""
def __init__(self, default_field="text"):
- super(Search, self).__init__()
+ super(Search, self).__init__(mode='r')
- # def get_tokens(self, searched, field='text', cached=None):
- # """returns tokens analyzed by a proper (for a field) analyzer
- # argument can be: StringReader, string/unicode, or tokens. In the last case
- # they will just be returned (so we can reuse tokens, if we don't change the analyzer)
- # """
- # if cached is not None and field in cached:
- # return cached[field]
-
- # if isinstance(searched, str) or isinstance(searched, unicode):
- # searched = StringReader(searched)
- # elif isinstance(searched, list):
- # return searched
-
- # searched.reset()
- # tokens = self.analyzer.reusableTokenStream(field, searched)
- # toks = []
- # while tokens.incrementToken():
- # cta = tokens.getAttribute(CharTermAttribute.class_)
- # toks.append(cta.toString())
-
- # if cached is not None:
- # cached[field] = toks
-
- # return toks
-
- # @staticmethod
- # def fuzziness(fuzzy):
- # """Helper method to sanitize fuzziness"""
- # if not fuzzy:
- # return None
- # if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
- # return fuzzy
- # else:
- # return 0.5
-
- # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
- # """
- # Return a PhraseQuery with a series of tokens.
- # """
- # if fuzzy:
- # phrase = MultiPhraseQuery()
- # for t in tokens:
- # term = Term(field, t)
- # fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
- # fuzzterms = []
-
- # while True:
- # ft = fuzzterm.term()
- # if ft:
- # fuzzterms.append(ft)
- # if not fuzzterm.next(): break
- # if fuzzterms:
- # phrase.add(JArray('object')(fuzzterms, Term))
- # else:
- # phrase.add(term)
- # else:
- # phrase = PhraseQuery()
- # phrase.setSlop(slop)
- # for t in tokens:
- # term = Term(field, t)
- # phrase.add(term)
- # return phrase
def make_term_query(self, query, field='text', modal=operator.or_):
"""
modal - applies to boolean query
fuzzy - should the query be fuzzy.
"""
+ if query is None: query = ''
q = self.index.Q()
q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
query.split(r" ")), q)
res = query.execute()
return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
- # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
- # """
- # Search for perfect book matches. Just see if the query matches with some author or title,
- # taking hints into account.
- # """
- # fields_to_search = ['authors', 'title']
- # only_in = None
- # if hint:
- # if not hint.should_search_for_book():
- # return []
- # fields_to_search = hint.just_search_in(fields_to_search)
- # only_in = hint.book_filter()
-
- # qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
-
- # books = []
- # for q in qrys:
- # top = self.searcher.search(q,
- # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
- # max_results)
- # for found in top.scoreDocs:
- # books.append(SearchResult(self, found, how_found="search_perfect_book"))
- # return books
-
- # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
- # fields_to_search = ['tags', 'authors', 'title']
-
- # only_in = None
- # if hint:
- # if not hint.should_search_for_book():
- # return []
- # fields_to_search = hint.just_search_in(fields_to_search)
- # only_in = hint.book_filter()
-
- # tokens = self.get_tokens(searched, field='SIMPLE')
-
- # q = BooleanQuery()
-
- # for fld in fields_to_search:
- # q.add(BooleanClause(self.make_term_query(tokens, field=fld,
- # fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
- # books = []
- # top = self.searcher.search(q,
- # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
- # max_results)
- # for found in top.scoreDocs:
- # books.append(SearchResult(self, found, how_found="search_book"))
-
- # return books
-
- # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
- # """
- # Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
- # some part/fragment of the book.
- # """
- # qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
-
- # flt = None
- # if hint:
- # flt = hint.part_filter()
-
- # books = []
- # for q in qrys:
- # top = self.searcher.search(q,
- # self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
- # flt]),
- # max_results)
- # for found in top.scoreDocs:
- # books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
-
- # return books
def search_everywhere(self, searched, query_terms=None):
"""
continue
text = snippets.get((int(position),
int(length)))
- print "== %s -- %s ==" % (query, text)
snip = self.index.highlight(text=text, field=field, q=query)
snips[idx] = snip
if snip:
idx += 1
except IOError, e:
- log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
+ log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
return []
finally:
snippets.close()
snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
searchresult.snippets = snips
+
return snips
def hint_tags(self, query, pdcounter=True, prefix=True):
res = self.apply_filters(query, filters).execute()
tags = []
+ pd_tags = []
+
for doc in res:
is_pdcounter = doc.get('is_pdcounter', False)
category = doc.get('tag_category')
tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
tag.category = 'pd_book' # make it look more lik a tag.
else:
- print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
+ print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)).encode('utf-8')
+ pd_tags.append(tag)
else:
tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
- # don't add the pdcounter tag if same tag already exists
-
- tags.append(tag)
+ tags.append(tag)
except catalogue.models.Tag.DoesNotExist: pass
except PDCounterAuthor.DoesNotExist: pass
except PDCounterBook.DoesNotExist: pass
+ tags_slugs = set(map(lambda t: t.slug, tags))
+ tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
+
log.debug('search_tags: %s' % tags)
return tags
Searches for Book objects using query
"""
bks = []
+ bks_found = set()
+ query = query.query(is_book=True)
res = self.apply_filters(query, filters).field_limit(['book_id'])
for r in res:
try:
- bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
+ bid = r['book_id']
+ if not bid in bks_found:
+ bks.append(catalogue.models.Book.objects.get(id=bid))
+ bks_found.add(bid)
except catalogue.models.Book.DoesNotExist: pass
return bks
- # def make_prefix_phrase(self, toks, field):
- # q = MultiPhraseQuery()
- # for i in range(len(toks)):
- # t = Term(field, toks[i])
- # if i == len(toks) - 1:
- # pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
- # if pterms:
- # q.add(pterms)
- # else:
- # q.add(t)
- # else:
- # q.add(t)
- # return q
-
- # @staticmethod
- # def term_filter(term, inverse=False):
- # only_term = TermsFilter()
- # only_term.addTerm(term)
-
- # if inverse:
- # neg = BooleanFilter()
- # neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
- # only_term = neg
-
- # return only_term
-
-
@staticmethod
def apply_filters(query, filters):
    """
    Narrow `query` by each entry of `filters`, in order.

    Every filter is handed to the current query's ``query()`` method and
    the object it returns becomes the query the next filter is applied to.
    Returns the query obtained after the last filter; when `filters` is
    empty that is `query` itself.
    """
    narrowed = query
    for condition in filters:
        narrowed = narrowed.query(condition)
    return narrowed
-
- # def filtered_categories(self, tags):
- # """
- # Return a list of tag categories, present in tags list.
- # """
- # cats = {}
- # for t in tags:
- # cats[t.category] = True
- # return cats.keys()
-
- # def hint(self):
- # return Hint(self)