# -*- coding: utf-8 -*-
-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
from django.conf import settings
import os
import logging
log = logging.getLogger('search')
import sunburnt
-import highlight
+import custom
+import operator
+log = logging.getLogger('search')
class SolrIndex(object):
def __init__(self, mode=None):
- self.index = highlight.HLSolrInterface(settings.SOLR, mode=mode)
+ self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
class Snippets(object):
def close(self):
"""Close snippet file"""
- self.file.close()
+ if self.file:
+ self.file.close()
def remove(self):
self.revision = None
Class indexing books.
"""
def __init__(self):
- super(Index, self).__init__()
+ super(Index, self).__init__(mode='rw')
def delete_query(self, *queries):
"""
break
for res in ids:
uids.add(res['uid'])
- st+=rows
- # print "Will delete %s" % ','.join([x for x in uids])
+ st += rows
if uids:
self.index.delete(uids)
return True
Removes all tags from index, then index them again.
Indexed fields include: id, name (with and without polish stems), category
"""
+ log.debug("Indexing tags")
remove_only = kw.get('remove_only', False)
# first, remove tags from index.
if tags:
q_id_cat = self.index.Q(q_id & q_cat)
tag_qs.append(q_id_cat)
- self.delete_query(tag_qs)
+ self.delete_query(*tag_qs)
else: # all
q = self.index.Q(tag_id__any=True)
self.delete_query(q)
"tag_name": tag.name,
"tag_name_pl": tag.name,
"tag_category": 'pd_author',
- "is_pdcounter": True
+ "is_pdcounter": True,
+ "uid": "tag%d_pd_a" % tag.id
}
elif isinstance(tag, PDCounterBook):
doc = {
"tag_name": tag.title,
"tag_name_pl": tag.title,
"tag_category": 'pd_book',
- "is_pdcounter": True
+ "is_pdcounter": True,
+ "uid": "tag%d_pd_b" % tag.id
}
else:
doc = {
"tag_name": tag.name,
"tag_name_pl": tag.name,
"tag_category": tag.category,
- "is_pdcounter": False
+ "is_pdcounter": False,
+ "uid": "tag%d" % tag.id
}
- doc['uid'] = "tag%d" % tag.id
self.index.add(doc)
def create_book_doc(self, book):
self.remove_book(book, remove_snippets=False)
book_doc = self.create_book_doc(book)
- meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+ meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
# let's not index it - it's only used for extracting publish date
if 'source_name' in meta_fields:
del meta_fields['source_name']
book_doc['uid'] = "book%s" % book_doc['book_id']
self.index.add(book_doc)
del book_doc
-
- self.index_content(book, book_fields={
+ book_fields = {
'title': meta_fields['title'],
'authors': meta_fields['authors'],
- 'published_date': meta_fields['published_date']})
+ 'published_date': meta_fields['published_date']
+ }
+
+ if 'translators' in meta_fields:
+ book_fields['translators'] = meta_fields['translators']
+
+ self.index_content(book, book_fields=book_fields)
master_tags = [
'opowiadanie',
doc['themes'] = fields['themes']
doc['uid'] = "part%s%s%s" % (doc['header_index'],
doc['header_span'],
- doc.get('fragment_anchor',''))
+ doc.get('fragment_anchor', ''))
return doc
def give_me_utf8(s):
doc = add_part(snippets, header_index=position, header_type=header.tag,
text=u''.join(footnote),
is_footnote=True)
-
self.index.add(doc)
- #print "@ footnote text: %s" % footnote
footnote = []
# handle fragments and themes.
fragment_anchor=fid,
text=fix_format(frag['text']),
themes=frag['themes'])
- #print '@ FRAG %s' % frag['content']
self.index.add(doc)
# Collect content.
# in the end, add a section text.
doc = add_part(snippets, header_index=position,
header_type=header.tag, text=fix_format(content))
- #print '@ CONTENT: %s' % fix_format(content)
self.index.add(doc)
snippets.close()
-
class SearchResult(object):
def __init__(self, doc, how_found=None, query=None, query_terms=None):
    """
    Wrap a single document returned by the index.

    doc         - mapping with the stored fields of the found document;
                  "book_id" is required, "score" and "published_date"
                  are optional.
    how_found   - label of the search method that produced the document;
                  it is stored together with the content hit.
    query       - accepted for interface compatibility, not used here.
    query_terms - query words, stored for use when hits are processed.
    """
    self.boost = 1.0
    self._hits = []
    self._processed_hits = None  # processed hits
    self.snippets = []
    self.query_terms = query_terms

    self._score = doc['score'] if 'score' in doc else 0
    self.book_id = int(doc["book_id"])

    try:
        self.published_date = int(doc.get("published_date"))
    except (TypeError, ValueError):
        # int(None) raises TypeError when the field is missing,
        # int() of a malformed value raises ValueError.
        self.published_date = 0

    # content hits
    header_type = doc.get("header_type", None)
    if header_type is None:
        # a book-level document carries no content hit
        return

    # we have a content hit in some header of fragment
    sec = (header_type, int(doc["header_index"]))
    header_span = doc['header_span']
    # a missing span, or a span of 0, counts as a single section
    header_span = header_span is not None and int(header_span) or 1
    fragment = doc.get("fragment_anchor", None)
    # .get(): a document without stored snippet coordinates yields
    # (None, None) instead of raising KeyError
    snippets_pos = (doc.get('snippets_position'), doc.get('snippets_length'))
    snippets_rev = doc.get('snippets_revision', None)

    hit = (sec + (header_span,), fragment, self._score, {
        'how_found': how_found,
        'snippets_pos': snippets_pos,
        'snippets_revision': snippets_rev,
        'themes': doc.get('themes', []),
        'themes_pl': doc.get('themes_pl', [])
    })
    self._hits.append(hit)
+ def __unicode__(self):
+ return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
+ (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
+
+ def __str__(self):
+ return unicode(self).encode('utf-8')
@property
def score(self):
def get_book(self):
if hasattr(self, '_book'):
return self._book
- return catalogue.models.Book.objects.get(id=self.book_id)
+ self._book = catalogue.models.Book.objects.get(id=self.book_id)
+ return self._book
book = property(get_book)
+ POSITION = 0
+ FRAGMENT = 1
+ POSITION_INDEX = 1
+ POSITION_SPAN = 2
+ SCORE = 2
+ OTHER = 3
+
@property
def hits(self):
if self._processed_hits is not None:
return self._processed_hits
- POSITION = 0
- FRAGMENT = 1
- POSITION_INDEX = 1
- POSITION_SPAN = 2
- SCORE = 2
- OTHER = 3
-
# to sections and fragments
- frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
+ frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
- sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
+ sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
# sections not covered by fragments
sect = filter(lambda s: 0 == len(filter(
- lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
- and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
+ lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
+ and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
frags)), sect)
hits = []
return els.values()
# remove fragments with duplicated fid's and duplicated snippets
- frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
- frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
- lambda a, b: cmp(a[SCORE], b[SCORE]))
+ frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
+ # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
+ # lambda a, b: cmp(a[SCORE], b[SCORE]))
# remove duplicate sections
sections = {}
for s in sect:
- si = s[POSITION][POSITION_INDEX]
+ si = s[self.POSITION][self.POSITION_INDEX]
# skip existing
if si in sections:
- if sections[si]['score'] >= s[SCORE]:
+ if sections[si]['score'] >= s[self.SCORE]:
continue
- m = {'score': s[SCORE],
- 'section_number': s[POSITION][POSITION_INDEX] + 1,
+ m = {'score': s[self.SCORE],
+ 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
}
- m.update(s[OTHER])
+ m.update(s[self.OTHER])
sections[si] = m
hits = sections.values()
for f in frags:
try:
- frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
+ frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
except catalogue.models.Fragment.DoesNotExist:
# stale index
continue
-
# Figure out if we were searching for a token matching some word in theme name.
themes = frag.tags.filter(category='theme')
- themes_hit = []
- if self.searched is not None:
- tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
- for theme in themes:
- name_tokens = self.search.get_tokens(theme.name, 'POLISH')
- for t in tokens:
- if t in name_tokens:
- if not theme in themes_hit:
- themes_hit.append(theme)
+ themes_hit = set()
+ if self.query_terms is not None:
+ for i in range(0, len(f[self.OTHER]['themes'])):
+ tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
+ tms = map(unicode.lower, tms)
+ for qt in self.query_terms:
+ if qt in tms:
+ themes_hit.add(f[self.OTHER]['themes'][i])
break
- m = {'score': f[SCORE],
+ def theme_by_name(n):
+ th = filter(lambda t: t.name == n, themes)
+ if th:
+ return th[0]
+ else:
+ return None
+ themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
+
+ m = {'score': f[self.SCORE],
'fragment': frag,
- 'section_number': f[POSITION][POSITION_INDEX] + 1,
+ 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
'themes': themes,
'themes_hit': themes_hit
}
- m.update(f[OTHER])
+ m.update(f[self.OTHER])
hits.append(m)
hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
return hits
- def __unicode__(self):
- return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
-
@staticmethod
def aggregate(*result_lists):
books = {}
else:
return c
+ def __len__(self):
+ return len(self.hits)
-class Hint(object):
- """
- Given some hint information (information we already know about)
- our search target - like author, title (specific book), epoch, genre, kind
- we can narrow down search using filters.
- """
- def __init__(self, search):
- """
- Accepts a Searcher instance.
- """
- self.search = search
- self.book_tags = {}
- self.part_tags = []
- self._books = []
-
- def books(self, *books):
- """
- Give a hint that we search these books.
- """
- self._books = books
+ def snippet_pos(self, idx=0):
+ return self.hits[idx]['snippets_pos']
- def tags(self, tags):
- """
- Give a hint that these Tag objects (a list of)
- is necessary.
- """
- for t in tags:
- if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
- lst = self.book_tags.get(t.category, [])
- lst.append(t)
- self.book_tags[t.category] = lst
- if t.category in ['theme', 'theme_pl']:
- self.part_tags.append(t)
-
- def tag_filter(self, tags, field='tags'):
- """
- Given a lsit of tags and an optional field (but they are normally in tags field)
- returns a filter accepting only books with specific tags.
- """
- q = BooleanQuery()
-
- for tag in tags:
- toks = self.search.get_tokens(tag.name, field=field)
- tag_phrase = PhraseQuery()
- for tok in toks:
- tag_phrase.add(Term(field, tok))
- q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
-
- return QueryWrapperFilter(q)
-
- def book_filter(self):
- """
- Filters using book tags (all tag kinds except a theme)
- """
- tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
- if tags:
- return self.tag_filter(tags)
- else:
def snippet_revision(self, idx=0):
    """
    Return the snippet revision recorded for hit number `idx`.

    Returns None when there is no hit at that position or the hit
    carries no 'snippets_revision' entry. Other errors (for example
    ones raised while computing self.hits) are no longer swallowed.
    """
    try:
        return self.hits[idx]['snippets_revision']
    except (IndexError, KeyError):
        return None
- def part_filter(self):
- """
- This filter can be used to look for book parts.
- It filters on book id and/or themes.
- """
- fs = []
- if self.part_tags:
- fs.append(self.tag_filter(self.part_tags, field='themes'))
-
- if self._books != []:
- bf = BooleanFilter()
- for b in self._books:
- id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
- bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
- fs.append(bf)
-
- return Search.chain_filters(fs)
-
- def should_search_for_book(self):
- return self._books == []
-
- def just_search_in(self, all):
- """Holds logic to figure out which indexes should be search, when we have some hinst already"""
- some = []
- for field in all:
- if field == 'authors' and 'author' in self.book_tags:
- continue
- if field == 'title' and self._books != []:
- continue
- if (field == 'themes' or field == 'themes_pl') and self.part_tags:
- continue
- some.append(field)
- return some
-
class Search(SolrIndex):
"""
Search facilities.
"""
def __init__(self, default_field="text"):
- IndexStore.__init__(self)
- self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34)
- # self.analyzer = WLAnalyzer()
- reader = IndexReader.open(self.store, True)
- self.searcher = IndexSearcher(reader)
- self.parser = QueryParser(Version.LUCENE_34, default_field,
- self.analyzer)
-
- self.parent_filter = TermsFilter()
- self.parent_filter.addTerm(Term("is_book", "true"))
- index_changed.connect(self.reopen)
-
- def close(self):
- reader = self.searcher.getIndexReader()
- self.searcher.close()
- reader.close()
- super(Search, self).close()
- index_changed.disconnect(self.reopen)
-
- def reopen(self, **unused):
- reader = self.searcher.getIndexReader()
- rdr = reader.reopen()
- if not rdr.equals(reader):
- log.debug('Reopening index')
- oldsearch = self.searcher
- self.searcher = IndexSearcher(rdr)
- oldsearch.close()
- reader.close()
-
- def query(self, query):
- """Parse query in default Lucene Syntax. (for humans)
- """
- return self.parser.parse(query)
-
- def simple_search(self, query, max_results=50):
- """Runs a query for books using lucene syntax. (for humans)
- Returns (books, total_hits)
- """
-
- tops = self.searcher.search(self.query(query), max_results)
- bks = []
- for found in tops.scoreDocs:
- doc = self.searcher.doc(found.doc)
- bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
- return (bks, tops.totalHits)
-
- def get_tokens(self, searched, field='text', cached=None):
- """returns tokens analyzed by a proper (for a field) analyzer
- argument can be: StringReader, string/unicode, or tokens. In the last case
- they will just be returned (so we can reuse tokens, if we don't change the analyzer)
- """
- if cached is not None and field in cached:
- return cached[field]
-
- if isinstance(searched, str) or isinstance(searched, unicode):
- searched = StringReader(searched)
- elif isinstance(searched, list):
- return searched
-
- searched.reset()
- tokens = self.analyzer.reusableTokenStream(field, searched)
- toks = []
- while tokens.incrementToken():
- cta = tokens.getAttribute(CharTermAttribute.class_)
- toks.append(cta.toString())
-
- if cached is not None:
- cached[field] = toks
-
- return toks
-
- @staticmethod
- def fuzziness(fuzzy):
- """Helper method to sanitize fuzziness"""
- if not fuzzy:
- return None
- if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
- return fuzzy
- else:
- return 0.5
-
- def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
- """
- Return a PhraseQuery with a series of tokens.
- """
- if fuzzy:
- phrase = MultiPhraseQuery()
- for t in tokens:
- term = Term(field, t)
- fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
- fuzzterms = []
+ super(Search, self).__init__(mode='r')
- while True:
- ft = fuzzterm.term()
- if ft:
- fuzzterms.append(ft)
- if not fuzzterm.next(): break
- if fuzzterms:
- phrase.add(JArray('object')(fuzzterms, Term))
- else:
- phrase.add(term)
- else:
- phrase = PhraseQuery()
- phrase.setSlop(slop)
- for t in tokens:
- term = Term(field, t)
- phrase.add(term)
- return phrase
def make_term_query(self, query, field='text', modal=operator.or_):
    """
    Returns term queries joined by boolean query.

    query - whitespace separated search terms; None is treated as
            an empty string.
    field - name of the index field every term is matched against.
    modal - binary operator used to join the per-term queries
            (operator.or_ by default).

    Empty terms, produced by repeated, leading or trailing whitespace,
    are skipped; a query without any term yields an empty Q object.
    """
    # reduce() is a builtin only on Python 2; functools provides it
    # on both Python 2.6+ and Python 3.
    from functools import reduce

    if query is None:
        query = ''
    term_queries = [self.index.Q(**{field: term}) for term in query.split()]
    return reduce(modal, term_queries, self.index.Q())
def search_phrase(self, searched, field='text', book=False,
                  filters=None,
                  snippets=False):
    """
    Query a single field for `searched`.

    searched - value looked for in `field`.
    field    - index field to query.
    book     - when true, results are restricted with an is_book filter.
    filters  - optional list of extra queries narrowing the result;
               the list passed in is not modified.
    snippets - unused, kept for interface compatibility.

    Returns a list of SearchResult objects.
    """
    # work on a copy, so the is_book filter never leaks into the
    # list owned by the caller
    filters = list(filters) if filters is not None else []
    if book:
        filters.append(self.index.Q(is_book=True))

    q = self.index.query(**{field: searched})
    q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
    res = q.execute()
    return [SearchResult(found, how_found=u'search_phrase') for found in res]
-
def search_some(self, searched, fields, book=True,
                filters=None, snippets=True, query_terms=None):
    """
    Look for the words of `searched` in any of the given fields.

    searched    - search terms, passed to make_term_query for each field.
    fields      - list of index field names; the per-field queries are
                  OR-ed together.
    book        - when true, results are restricted with an is_book filter.
    filters     - optional list of extra queries narrowing the result;
                  the list passed in is not modified.
    snippets    - unused, kept for interface compatibility.
    query_terms - handed over to every SearchResult.

    Returns a list of SearchResult objects.
    """
    assert isinstance(fields, list)
    # work on a copy, so the is_book filter never leaks into the
    # list owned by the caller
    filters = list(filters) if filters is not None else []
    if book:
        filters.append(self.index.Q(is_book=True))

    query = self.index.Q()
    for fld in fields:
        query = self.index.Q(query | self.make_term_query(searched, fld))

    query = self.index.query(query)
    query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
    res = query.execute()
    return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
-
- def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
- """
- Search for perfect book matches. Just see if the query matches with some author or title,
- taking hints into account.
- """
- fields_to_search = ['authors', 'title']
- only_in = None
- if hint:
- if not hint.should_search_for_book():
- return []
- fields_to_search = hint.just_search_in(fields_to_search)
- only_in = hint.book_filter()
-
- qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
-
- books = []
- for q in qrys:
- top = self.searcher.search(q,
- self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
- max_results)
- for found in top.scoreDocs:
- books.append(SearchResult(self, found, how_found="search_perfect_book"))
- return books
-
- def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
- fields_to_search = ['tags', 'authors', 'title']
-
- only_in = None
- if hint:
- if not hint.should_search_for_book():
- return []
- fields_to_search = hint.just_search_in(fields_to_search)
- only_in = hint.book_filter()
-
- tokens = self.get_tokens(searched, field='SIMPLE')
-
- q = BooleanQuery()
-
- for fld in fields_to_search:
- q.add(BooleanClause(self.make_term_query(tokens, field=fld,
- fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
- books = []
- top = self.searcher.search(q,
- self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
- max_results)
- for found in top.scoreDocs:
- books.append(SearchResult(self, found, how_found="search_book"))
-
- return books
-
- def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
- """
- Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
- some part/fragment of the book.
- """
- qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
-
- flt = None
- if hint:
- flt = hint.part_filter()
-
- books = []
- for q in qrys:
- top = self.searcher.search(q,
- self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
- flt]),
- max_results)
- for found in top.scoreDocs:
- books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
-
- return books
-
def search_everywhere(self, searched, query_terms=None):
    """
    Tries to use search terms to match different fields of book (or its parts).
    E.g. one word can match the author, another be a part of the title,
    and the rest are some words from third chapter.

    Two queries are run: text combined with themes, then
    themes/text combined with tags/authors/title. The results of
    both are returned as one list of SearchResult objects, in that order.
    """
    def in_any(field_names):
        # OR together the term queries built for each of the fields
        combined = self.index.Q()
        for name in field_names:
            combined |= self.make_term_query(searched, field=name)
        return combined

    def wrap(docs, label):
        return [SearchResult(doc, how_found=label, query_terms=query_terms)
                for doc in docs]

    # content only query : themes x content
    text_q = self.make_term_query(searched, 'text')
    themes_q = self.make_term_query(searched, 'themes_pl')
    themed = self.index.query(text_q).query(themes_q)
    res = themed.field_limit(score=True, all_fields=True).execute()
    books = wrap(res, 'search_everywhere_themesXcontent')

    # query themes/content x author/title/tags
    both = in_any(['themes_pl', 'text']) & in_any(['tags', 'authors', 'title'])
    res = self.index.query(both).field_limit(score=True, all_fields=True).execute()
    books += wrap(res, 'search_everywhere')

    return books
- # def multisearch(self, query, max_results=50):
- # """
- # Search strategy:
- # - (phrase) OR -> content
- # -> title
- # -> authors
- # - (keywords) -> authors
- # -> motyw
- # -> tags
- # -> content
- # """
- # queryreader = StringReader(query)
- # tokens = self.get_tokens(queryreader)
-
- # top_level = BooleanQuery()
- # Should = BooleanClause.Occur.SHOULD
-
- # phrase_level = BooleanQuery()
- # phrase_level.setBoost(1.3)
-
- # p_content = self.make_phrase(tokens, joined=True)
- # p_title = self.make_phrase(tokens, 'title')
- # p_author = self.make_phrase(tokens, 'author')
-
- # phrase_level.add(BooleanClause(p_content, Should))
- # phrase_level.add(BooleanClause(p_title, Should))
- # phrase_level.add(BooleanClause(p_author, Should))
-
- # kw_level = BooleanQuery()
-
- # kw_level.add(self.make_term_query(tokens, 'author'), Should)
- # j_themes = self.make_term_query(tokens, 'themes', joined=True)
- # kw_level.add(j_themes, Should)
- # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
- # j_con = self.make_term_query(tokens, joined=True)
- # kw_level.add(j_con, Should)
-
- # top_level.add(BooleanClause(phrase_level, Should))
- # top_level.add(BooleanClause(kw_level, Should))
-
- # return None
-
def get_snippets(self, searchresult, query, field='text', num=1):
    """
    Fetch highlighted snippets for the hits of a search result.

    searchresult - SearchResult whose hits are to be illustrated.
    query        - query handed to the highlighter.
    field        - index field used for highlighting.
    num          - maximum number of non-empty snippets to produce;
                   None, a negative value or a value above the number
                   of hits means "up to one for every hit".

    Returns a list with one entry per hit (None where no snippet was
    produced); the same list is stored in searchresult.snippets.
    Returns [] when an IOError occurs while reading the snippet file.
    """
    maxnum = len(searchresult)
    if num is None or num < 0 or num > maxnum:
        num = maxnum
    book_id = searchresult.book_id
    revision = searchresult.snippet_revision()
    snippets = Snippets(book_id, revision=revision)
    snips = [None] * maxnum
    try:
        snippets.open()
        # A for loop advances idx on every path; the previous while
        # loop skipped its increment on `continue` and never ended
        # when a hit had no snippet position.
        for idx in range(maxnum):
            if num <= 0:
                break
            position, length = searchresult.snippet_pos(idx)
            if position is None or length is None:
                continue
            text = snippets.get((int(position),
                                 int(length)))
            snip = self.index.highlight(text=text, field=field, q=query)
            snips[idx] = snip
            if snip:
                num -= 1
    except IOError as e:
        log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
        return []
    finally:
        snippets.close()

    # remove verse end markers..
    snips = [s and s.replace("/\n", "\n") for s in snips]

    searchresult.snippets = snips
    return snips
def hint_tags(self, query, pdcounter=True, prefix=True):
    """
    Return auto-complete hints for tags.

    query     - text typed so far; surrounding whitespace is ignored.
    pdcounter - passed on to search_tags.
    prefix    - when true the text is matched as a prefix (a "*" is
                appended), otherwise a term query is built from it.

    Both 'tag_name' and 'tag_name_pl' are searched; tags of category
    "book" are excluded. Returns whatever search_tags returns.
    """
    combined = self.index.Q()
    needle = query.strip()
    for name in ('tag_name', 'tag_name_pl'):
        if not prefix:
            clause = self.make_term_query(needle, field=name)
        else:
            clause = self.index.Q(**{name: needle + "*"})
        combined |= clause

    without_books = self.index.query(combined).exclude(tag_category="book")
    return self.search_tags(without_books, pdcounter=pdcounter)
def search_tags(self, query, filters=None, pdcounter=False):
    """
    Search for Tag objects using query.

    query     - index query object to execute.
    filters   - optional list of extra queries narrowing the result;
                the list passed in is not modified.
    pdcounter - when false, documents flagged is_pdcounter are
                filtered out.

    Returns a list of catalogue tags followed by those pdcounter
    entries whose slug is not already among the catalogue tags.
    Index entries that no longer exist in the database are skipped.
    """
    # work on a copy, so the is_pdcounter filter never leaks into
    # the list owned by the caller
    filters = list(filters) if filters else []
    if not pdcounter:
        filters.append(~self.index.Q(is_pdcounter=True))
    res = self.apply_filters(query, filters).execute()

    tags = []
    pd_tags = []

    for doc in res:
        is_pdcounter = doc.get('is_pdcounter', False)
        category = doc.get('tag_category')
        try:
            if is_pdcounter == True:
                if category == 'pd_author':
                    tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                elif category == 'pd_book':
                    tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                    tag.category = 'pd_book'  # make it look more like a tag.
                else:
                    log.warning("Warning. cannot get pdcounter tag_id=%d from db; cat=%s",
                                int(doc.get('tag_id')), category)
                    # no object was fetched for this document, so
                    # there is nothing to append
                    continue
                pd_tags.append(tag)
            else:
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                tags.append(tag)
        except (catalogue.models.Tag.DoesNotExist,
                PDCounterAuthor.DoesNotExist,
                PDCounterBook.DoesNotExist):
            # the index refers to an object missing from the database
            continue

    # don't add the pdcounter tag if same tag already exists
    tags_slugs = set(t.slug for t in tags)
    tags = tags + [t for t in pd_tags if t.slug not in tags_slugs]

    log.debug('search_tags: %s' % tags)

    return tags
- def search_books(self, query, filt=None, max_results=10):
- """
- Searches for Book objects using query
- """
- bks = []
- tops = self.searcher.search(query, filt, max_results)
- for found in tops.scoreDocs:
- doc = self.searcher.doc(found.doc)
- try:
- bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
- except catalogue.models.Book.DoesNotExist: pass
- return bks
-
- def make_prefix_phrase(self, toks, field):
- q = MultiPhraseQuery()
- for i in range(len(toks)):
- t = Term(field, toks[i])
- if i == len(toks) - 1:
- pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
- if pterms:
- q.add(pterms)
- else:
- q.add(t)
- else:
- q.add(t)
- return q
-
- @staticmethod
- def term_filter(term, inverse=False):
- only_term = TermsFilter()
- only_term.addTerm(term)
-
- if inverse:
- neg = BooleanFilter()
- neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
- only_term = neg
-
- return only_term
-
- def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
- """
- Return auto-complete hints for tags
- using prefix search.
- """
- toks = self.get_tokens(string, field='SIMPLE')
- top = BooleanQuery()
-
- for field in ['tag_name', 'tag_name_pl']:
- if prefix:
- q = self.make_prefix_phrase(toks, field)
- else:
- q = self.make_term_query(toks, field, fuzzy=fuzzy)
- top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
-
- no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
-
- return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
-
def hint_books(self, query, prefix=True):
    """
    Returns auto-complete hints for book titles
    Because we do not index 'pseudo' title-tags.

    query  - text typed so far; surrounding whitespace is ignored.
    prefix - when true the title is matched as a prefix (a "*" is
             appended), otherwise a term query is built from the text.

    The search is restricted with an is_book filter and delegated
    to search_books, whose result is returned.
    """
    title_q = self.index.Q()
    needle = query.strip()
    if not prefix:
        title_q |= self.make_term_query(needle, field='title')
    else:
        title_q |= self.index.Q(title=needle + "*")

    book_query = self.index.query(title_q)
    book_filters = [self.index.Q(is_book=True)]
    return self.search_books(book_query, book_filters)
def search_books(self, query, filters=None, max_results=10):
    """
    Searches for Book objects using query

    query       - index query object; it is further restricted to
                  documents with is_book=True.
    filters     - optional list of extra queries narrowing the result.
    max_results - upper bound on the number of books returned;
                  None means no bound.

    Returns a list of distinct Book objects in the order they were
    found. Index entries whose book is missing from the database
    are skipped.
    """
    query = query.query(is_book=True)
    res = self.apply_filters(query, filters).field_limit(['book_id'])

    bks = []
    bks_found = set()
    for r in res:
        # honour the limit, which used to be accepted but ignored
        if max_results is not None and len(bks) >= max_results:
            break
        bid = r['book_id']
        if bid in bks_found:
            continue
        try:
            book = catalogue.models.Book.objects.get(id=bid)
        except catalogue.models.Book.DoesNotExist:
            # the index refers to a book missing from the database
            continue
        bks.append(book)
        bks_found.add(bid)
    return bks
+
- def filtered_categories(self, tags):
+ @staticmethod
+ def apply_filters(query, filters):
"""
- Return a list of tag categories, present in tags list.
+ Apply filters to a query
"""
- cats = {}
- for t in tags:
- cats[t.category] = True
- return cats.keys()
-
- def hint(self):
- return Hint(self)
+ if filters is None: filters = []
+ filters = filter(lambda x: x is not None, filters)
+ for f in filters:
+ query = query.query(f)
+ return query