From: Marcin Koziej
Date: Wed, 24 Oct 2012 14:49:54 +0000 (+0200)
Subject: search changes
X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/35b64fd8bec183054b63234aebf8782b87cf5cc5?hp=d157af1061e9f03f59ea909d7d25f4a0b41f1c0e

search changes
---

diff --git a/apps/search/custom.py b/apps/search/custom.py
index 86d387e02..e6f559b62 100644
--- a/apps/search/custom.py
+++ b/apps/search/custom.py
@@ -6,6 +6,8 @@ import warnings
 from sunburnt import search
 import copy
 from httplib2 import socket
+import re
+
 
 class TermVectorOptions(search.Options):
     def __init__(self, schema, original=None):
@@ -93,7 +95,6 @@ class CustomSolrInterface(sunburnt.SolrInterface):
             self.init_schema()
         except socket.error, e:
             raise socket.error, "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e)
-
 
     def _analyze(self, **kwargs):
         if not self.readable:
@@ -134,6 +135,25 @@ class CustomSolrInterface(sunburnt.SolrInterface):
         terms = map(lambda n: unicode(n.text), terms)
         return terms
 
+    def expand_margins(self, text, start, end):
+        totlen = len(text)
+
+        def is_boundary(x):
+            ws = re.compile(r"\W", re.UNICODE)
+            return bool(ws.match(x))
+
+        while start > 0:
+            if is_boundary(text[start - 1]):
+                break
+            start -= 1
+
+        while end < totlen - 1:
+            if is_boundary(text[end + 1]):
+                break
+            end += 1
+
+        return (start, end)
+
     def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
         start = None
         end = None
@@ -142,23 +162,28 @@
                               ((s, e), (max(0, s - margins), min(totlen, e + margins))),
                               matches)
+        matches_margins = map(lambda (m, (s, e)):
+                              (m, self.expand_margins(text, s, e)),
+                              matches_margins)
+
+        # let's start with first match
         (start, end) = matches_margins[0][1]
-        matches = []
+        matches = [matches_margins[0][0]]
+
         for (m, (s, e)) in matches_margins[1:]:
             if end < s or start > e:
                 continue
             start = min(start, s)
             end = max(end, e)
             matches.append(m)
-
+
         snip = text[start:end]
         matches.sort(lambda a, b: cmp(b[0], a[0]))
+        print matches
 
         for (s, e) in matches:
             off = - start
             snip = snip[:e + off] + mark[1] + snip[e + off:]
             snip = snip[:s + off] + mark[0] + snip[s + off:]
-            # maybe break on word boundaries
 
         return snip
-

diff --git a/apps/search/index.py b/apps/search/index.py
index 26da06220..ea1a6c581 100644
--- a/apps/search/index.py
+++ b/apps/search/index.py
@@ -548,7 +548,7 @@ class SearchResult(object):
         header_span = header_span is not None and int(header_span) or 1
         fragment = doc.get("fragment_anchor", None)
         snippets_pos = (doc['snippets_position'], doc['snippets_length'])
-        snippets_rev = doc['snippets_revision']
+        snippets_rev = doc.get('snippets_revision', None)
 
         hit = (sec + (header_span,), fragment, self._score, {
             'how_found': how_found,
@@ -561,7 +561,7 @@ class SearchResult(object):
         self._hits.append(hit)
 
     def __unicode__(self):
-        return u""
+        return u"<SearchResult: %d, %d hits (%d processed), score %f, %d snippets>" % \
+            (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
 
     def __str__(self):
@@ -725,68 +725,6 @@ class Search(SolrIndex):
 
     def __init__(self, default_field="text"):
         super(Search, self).__init__(mode='r')
 
-    # def get_tokens(self, searched, field='text', cached=None):
-    #     """returns tokens analyzed by a proper (for a field) analyzer
-    #     argument can be: StringReader, string/unicode, or tokens.
-    #     In the last case they will just be returned (so we can reuse tokens, if we don't change the analyzer)
-    #     """
-    #     if cached is not None and field in cached:
-    #         return cached[field]
-
-    #     if isinstance(searched, str) or isinstance(searched, unicode):
-    #         searched = StringReader(searched)
-    #     elif isinstance(searched, list):
-    #         return searched
-
-    #     searched.reset()
-    #     tokens = self.analyzer.reusableTokenStream(field, searched)
-    #     toks = []
-    #     while tokens.incrementToken():
-    #         cta = tokens.getAttribute(CharTermAttribute.class_)
-    #         toks.append(cta.toString())
-
-    #     if cached is not None:
-    #         cached[field] = toks
-
-    #     return toks
-
-    # @staticmethod
-    # def fuzziness(fuzzy):
-    #     """Helper method to sanitize fuzziness"""
-    #     if not fuzzy:
-    #         return None
-    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
-    #         return fuzzy
-    #     else:
-    #         return 0.5
-
-    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
-    #     """
-    #     Return a PhraseQuery with a series of tokens.
-    #     """
-    #     if fuzzy:
-    #         phrase = MultiPhraseQuery()
-    #         for t in tokens:
-    #             term = Term(field, t)
-    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
-    #             fuzzterms = []
-
-    #             while True:
-    #                 ft = fuzzterm.term()
-    #                 if ft:
-    #                     fuzzterms.append(ft)
-    #                 if not fuzzterm.next(): break
-    #             if fuzzterms:
-    #                 phrase.add(JArray('object')(fuzzterms, Term))
-    #             else:
-    #                 phrase.add(term)
-    #     else:
-    #         phrase = PhraseQuery()
-    #         phrase.setSlop(slop)
-    #         for t in tokens:
-    #             term = Term(field, t)
-    #             phrase.add(term)
-    #     return phrase
 
     def make_term_query(self, query, field='text', modal=operator.or_):
         """
@@ -828,78 +766,6 @@ class Search(SolrIndex):
         res = query.execute()
         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 
-    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
-    #     """
-    #     Search for perfect book matches. Just see if the query matches with some author or title,
-    #     taking hints into account.
- # """ - # fields_to_search = ['authors', 'title'] - # only_in = None - # if hint: - # if not hint.should_search_for_book(): - # return [] - # fields_to_search = hint.just_search_in(fields_to_search) - # only_in = hint.book_filter() - - # qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search] - - # books = [] - # for q in qrys: - # top = self.searcher.search(q, - # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]), - # max_results) - # for found in top.scoreDocs: - # books.append(SearchResult(self, found, how_found="search_perfect_book")) - # return books - - # def search_book(self, searched, max_results=20, fuzzy=False, hint=None): - # fields_to_search = ['tags', 'authors', 'title'] - - # only_in = None - # if hint: - # if not hint.should_search_for_book(): - # return [] - # fields_to_search = hint.just_search_in(fields_to_search) - # only_in = hint.book_filter() - - # tokens = self.get_tokens(searched, field='SIMPLE') - - # q = BooleanQuery() - - # for fld in fields_to_search: - # q.add(BooleanClause(self.make_term_query(tokens, field=fld, - # fuzzy=fuzzy), BooleanClause.Occur.SHOULD)) - - # books = [] - # top = self.searcher.search(q, - # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]), - # max_results) - # for found in top.scoreDocs: - # books.append(SearchResult(self, found, how_found="search_book")) - - # return books - - # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None): - # """ - # Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase()) - # some part/fragment of the book. - # """ - # qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']] - - # flt = None - # if hint: - # flt = hint.part_filter() - - # books = [] - # for q in qrys: - # top = self.searcher.search(q, - # self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True), - # flt]), - # max_results) - # for found in top.scoreDocs: - # books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts')) - - # return books def search_everywhere(self, searched, query_terms=None): """ @@ -1055,33 +921,6 @@ class Search(SolrIndex): except catalogue.models.Book.DoesNotExist: pass return bks - # def make_prefix_phrase(self, toks, field): - # q = MultiPhraseQuery() - # for i in range(len(toks)): - # t = Term(field, toks[i]) - # if i == len(toks) - 1: - # pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t)) - # if pterms: - # q.add(pterms) - # else: - # q.add(t) - # else: - # q.add(t) - # return q - - # @staticmethod - # def term_filter(term, inverse=False): - # only_term = TermsFilter() - # only_term.addTerm(term) - - # if inverse: - # neg = BooleanFilter() - # neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT)) - # only_term = neg - - # return only_term - - @staticmethod def apply_filters(query, filters): @@ -1093,15 +932,3 @@ class Search(SolrIndex): for f in filters: query = query.query(f) return query - - # def filtered_categories(self, tags): - # """ - # Return a list of tag categories, present in tags list. 
- # """ - # cats = {} - # for t in tags: - # cats[t.category] = True - # return cats.keys() - - # def hint(self): - # return Hint(self) diff --git a/apps/search/management/commands/checkindex.py b/apps/search/management/commands/checkindex.py deleted file mode 100644 index b910277de..000000000 --- a/apps/search/management/commands/checkindex.py +++ /dev/null @@ -1,22 +0,0 @@ - -from django.core.management.base import BaseCommand -from search import IndexChecker - -class Command(BaseCommand): - help = 'Check Lucene search index' - args = '' - - def handle(self, *args, **opts): - checker = IndexChecker() - status = checker.check() - if status.clean: - print "No problems found." - else: - if status.missingSegments: - print "Unable to locate." - print "Number of bad segments: %d / %d (max segment name is %d)" % \ - (status.numBadSegments, status.numSegments, status.maxSegmentName) - print "Total lost documents (due to bad segments) %d" % status.totLoseDocCount - if not status.validCounter: - print "Segment counter is not valid." - diff --git a/apps/search/tests/files/fraszka-do-anusie.xml b/apps/search/tests/files/fraszka-do-anusie.xml deleted file mode 100755 index 3bbda155e..000000000 --- a/apps/search/tests/files/fraszka-do-anusie.xml +++ /dev/null @@ -1,49 +0,0 @@ - - - - -Sęp Szarzyński, Mikołaj -Fraszka do Anusie -Sekuła, Aleksandra -Sutkowska, Olga -Fundacja Nowoczesna Polska -Barok -Liryka -Fraszka -Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. -http://wolnelektury.pl/katalog/lektura/fraszka-do-anusie -http://www.polona.pl/Content/8759 -Szarzyński Sęp, Mikołaj (ca 1550-1581), Rytmy abo Wiersze polskie w wyborze, E. Wende, Warszawa, 1914 -Domena publiczna - Mikołaj Sęp Szarzyński zm. 1581 -1581 -xml -text -text -2008-12-29 -L -L -pol - - - - -Mikołaj Sęp Szarzyński - -Fraszka do Anusie - - - -Kochanek, Łzy, Miłość, Oko, Serce, WzrokJeśli oczu hamować swoich nie umiały/ -Leśnych krynic boginie, aby nie płakały,/ -Gdy baczyłybaczyły --- tu: zobaczyły, patrzyły na. przy studni Narcyza pięknego,/ -A on umarł prze miłość oblicza swojego;/ -Jeśli nieśmiertelnym stanom żałość rozkazuje,/ -Gdy niebaczna fortuna co niesłusznie psuje: - -Jakoż ja mam hamować, by na lice moje/ -Z oczu smutnych żałośne nie płynęły zdroje?/ -Jako serce powściągać, aby nie wzdychało/ -I od ciężkiej żałości omdlewać nie miało? - - - diff --git a/apps/search/tests/files/fraszki.xml b/apps/search/tests/files/fraszki.xml deleted file mode 100755 index edb29abbc..000000000 --- a/apps/search/tests/files/fraszki.xml +++ /dev/null @@ -1,27 +0,0 @@ - - - - -Kochanowski, Jan -Fraszki -http://wolnelektury.pl/katalog/lektura/fraszka-do-anusie - -Fundacja Nowoczesna Polska -Renesans -Liryka -Fraszka - - -http://wolnelektury.pl/lektura/fraszki - -Domena publiczna - Jan Kochanowski zm. 
-1584
-xml
-text
-
-text
-2008-11-12
-pol
-
-
-
diff --git a/apps/search/tests/index.py b/apps/search/tests/index.py
index 5155a84e4..12f6f7d1c 100644
--- a/apps/search/tests/index.py
+++ b/apps/search/tests/index.py
@@ -1,12 +1,13 @@
 # -*- coding: utf-8 -*-
 from django.conf import settings
 from django.test.utils import override_settings
-from catalogue.test_utils import WLTestCase
-from lucene import PolishAnalyzer, Version
+from catalogue.test_utils import WLTestCase, get_fixture
 from os import path
 import tempfile
-from catalogue import models
-from search import Search, SearchResult
+from catalogue.models import Book, Tag
+from search import Index, Search, SearchResult
+import catalogue
+import opds
 
 
 @override_settings(
@@ -16,23 +17,30 @@ class BookSearchTests(WLTestCase):
     def setUp(self):
         WLTestCase.setUp(self)
-        txt = path.join(path.dirname(__file__), 'files/fraszka-do-anusie.xml')
+        index = Index()
+        index.index.delete_all()
+        index.index.commit()
+
         with self.settings(NO_SEARCH_INDEX=False):
-            self.book = models.Book.from_xml_file(txt)
+            self.do_doktora = Book.from_xml_file(
+                get_fixture('do-doktora.xml', opds))
+            self.do_anusie = Book.from_xml_file(
+                get_fixture('fraszka-do-anusie.xml', catalogue))
+
         self.search = Search()
 
     def test_search_perfect_book_author(self):
-        books = self.search.search_perfect_book("sęp szarzyński")
+        books = self.search.search_books(self.search.index.query(authors=u"sęp szarzyński"))
         assert len(books) == 1
         assert books[0].book_id == self.book.id
 
     def test_search_perfect_book_title(self):
-        books = self.search.search_perfect_book("fraszka anusie")
+        books = self.search.search_books(self.search.index.query(u"fraszka anusie"))
         assert len(books) == 1
         assert books[0].book_id == self.book.id
 
     def test_search_perfect_parts(self):
-        books = self.search.search_perfect_parts("Jakoż hamować")
+        books = self.search.search_phrase(u"Jakoż hamować")
         assert len(books) == 2
         for b in books:
             b.book_id == self.book.id
@@ -41,28 +49,7 @@ class BookSearchTests(WLTestCase):
         assert len(a[0].hits) == 1
 
     def test_search_perfect_author_title(self):
-        books = self.search.search_perfect_book("szarzyński anusie")
+        books = self.search.search_books(self.search.index.query(authors=u"szarzyński anusie"))
         assert books == []
 
-        books = self.search.search_book("szarzyński anusie")
-        assert len(books) == 1
-
-        books = self.search.search_book("szarzyński fraszka")
-        assert len(books) == 1
-
-    def test_search_everywhere(self):
-        books = self.search.search_everywhere("szarzyński kochanek")
-        print 'szarzyński kochanek %s' % [b.hits for b in books]
-
-        books = self.search.search_everywhere("szarzyński narcyz")
-        print 'szarzyński narcyz %s' % [b.hits for b in books]
-
-        books = self.search.search_everywhere("anusie narcyz")
-        print 'anusie narcyz %s' % [b.hits for b in books]
-
-        # theme content cross
-        books = self.search.search_everywhere("wzrok boginie")
-        print 'wzrok boginie %s' % [b.hits for b in books]
-
-        books = self.search.search_everywhere("anusie płynęły zdroje")
-        print 'anusie płynęły zdroje %s' % [b.hits for b in books]
+
diff --git a/apps/search/views.py b/apps/search/views.py
index 72852d0d4..36dd52cd2 100644
--- a/apps/search/views.py
+++ b/apps/search/views.py
@@ -188,14 +188,15 @@ def main(request):
     # ensure books do exists & sort them
     results.sort(reverse=True)
 
-    if len(results) == 1:
-        fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
-        if len(fragment_hits) == 1:
-            #anchor = fragment_hits[0]['fragment']
-            #frag = Fragment.objects.get(anchor=anchor)
-            return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
-        return HttpResponseRedirect(results[0].book.get_absolute_url())
-    elif len(results) == 0:
+    # We don't want to redirect to the book text, but rather display the result page even with one result.
+    # if len(results) == 1:
+    #     fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
+    #     if len(fragment_hits) == 1:
+    #         #anchor = fragment_hits[0]['fragment']
+    #         #frag = Fragment.objects.get(anchor=anchor)
+    #         return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
+    #     return HttpResponseRedirect(results[0].book.get_absolute_url())
+    if len(results) == 0:
         form = PublishingSuggestForm(initial={"books": query + ", "})
         return render_to_response('catalogue/search_no_hits.html',
                                   {'tags': tags,
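
For reference, the snippet-highlighting logic added to CustomSolrInterface.substring() in apps/search/custom.py can be exercised in isolation. The sketch below is a minimal, self-contained rendition of the same algorithm (margin windows snapped to word boundaries, overlapping windows merged into one snippet range, marks inserted from the rightmost match backwards), rewritten here with slice-style (start, end) offsets. The name make_snippet and the sample text are illustrative only, not part of the repository code.

# -*- coding: utf-8 -*-
import re


def expand_margins(text, start, end):
    # Widen the slice [start, end) until both ends rest on word boundaries.
    def is_boundary(c):
        return bool(re.match(r"\W", c, re.UNICODE))

    while start > 0 and not is_boundary(text[start - 1]):
        start -= 1
    while end < len(text) and not is_boundary(text[end]):
        end += 1
    return start, end


def make_snippet(text, matches, margins=30, mark=("<b>", "</b>")):
    # matches: (start, end) character slices of query hits in `text`.
    # 1. Attach a +/- `margins` window to each match, snapped to word
    #    boundaries so the snippet never cuts a word in half.
    windows = [((s, e), expand_margins(text,
                                       max(0, s - margins),
                                       min(len(text), e + margins)))
               for (s, e) in matches]

    # 2. Grow one snippet range from the first window; keep only the
    #    matches whose windows overlap it.
    (start, end) = windows[0][1]
    kept = [windows[0][0]]
    for (m, (s, e)) in windows[1:]:
        if end < s or start > e:
            continue
        start, end = min(start, s), max(end, e)
        kept.append(m)

    # 3. Insert the marks starting from the rightmost match, so the
    #    offsets of matches still to be marked are not shifted.
    snip = text[start:end]
    for (s, e) in sorted(kept, reverse=True):
        snip = snip[:e - start] + mark[1] + snip[e - start:]
        snip = snip[:s - start] + mark[0] + snip[s - start:]
    return snip


if __name__ == '__main__':
    text = u"Jako serce powstrzymywac, aby nie wzdychalo"
    # Hits for "serce" (5..10) and "wzdychalo" (34..43):
    print make_snippet(text, [(5, 10), (34, 43)], margins=10)
    # -> u"Jako <b>serce</b> powstrzymywac, aby nie <b>wzdychalo</b>"

Because the marks are inserted right to left, the snippet may grow with each insertion without invalidating the remaining match offsets; this mirrors the descending sort (cmp(b[0], a[0])) in the committed substring() code.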