From 38f324dec64ba8adffcc795095023557b8a7a39c Mon Sep 17 00:00:00 2001 From: Marcin Koziej Date: Wed, 1 Feb 2012 13:06:41 +0100 Subject: [PATCH] fixes on staging --- apps/search/index.py | 32 ++++++----- apps/search/management/commands/reindex.py | 14 +++-- apps/search/management/commands/snippets.py | 60 +++++++++++++++++++++ apps/search/views.py | 17 +++--- wolnelektury/static/css/book_box.css | 2 + 5 files changed, 101 insertions(+), 24 deletions(-) create mode 100755 apps/search/management/commands/snippets.py diff --git a/apps/search/index.py b/apps/search/index.py index 97145d340..8ea31240e 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -1212,17 +1212,25 @@ class Search(IndexStore): if position is None or length is None: return None # locate content. - snippets = Snippets(stored.get('book_id')).open() + book_id = int(stored.get('book_id')) + snippets = Snippets(book_id).open() try: - text = snippets.get((int(position), - int(length))) - finally: - snippets.close() + try: + text = snippets.get((int(position), + int(length))) + finally: + snippets.close() - tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer) - # highlighter.getBestTextFragments(tokenStream, text, False, 10) - snip = highlighter.getBestFragments(tokenStream, text, 3, "...") + tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer) + # highlighter.getBestTextFragments(tokenStream, text, False, 10) + snip = highlighter.getBestFragments(tokenStream, text, 3, "...") + except Exception, e: + e2 = e + if hasattr(e, 'getJavaException'): + e2 = unicode(e.getJavaException()) + raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)), + e2) return snip @staticmethod @@ -1302,7 +1310,7 @@ class Search(IndexStore): return only_term - def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True): + def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False): """ Return auto-complete hints for tags using prefix search. @@ -1314,14 +1322,14 @@ class Search(IndexStore): if prefix: q = self.make_prefix_phrase(toks, field) else: - q = self.make_term_query(toks, field) + q = self.make_term_query(toks, field, fuzzy=fuzzy) top.add(BooleanClause(q, BooleanClause.Occur.SHOULD)) no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True) return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter) - def hint_books(self, string, max_results=50, prefix=True): + def hint_books(self, string, max_results=50, prefix=True, fuzzy=False): """ Returns auto-complete hints for book titles Because we do not index 'pseudo' title-tags. @@ -1332,7 +1340,7 @@ class Search(IndexStore): if prefix: q = self.make_prefix_phrase(toks, 'title') else: - q = self.make_term_query(toks, 'title') + q = self.make_term_query(toks, 'title', fuzzy=fuzzy) return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results) diff --git a/apps/search/management/commands/reindex.py b/apps/search/management/commands/reindex.py index bce47080d..890110238 100755 --- a/apps/search/management/commands/reindex.py +++ b/apps/search/management/commands/reindex.py @@ -1,9 +1,14 @@ from django.core.management.base import BaseCommand +from optparse import make_option class Command(BaseCommand): help = 'Reindex everything.' args = '' - + + option_list = BaseCommand.option_list + ( + make_option('-n', '--book-id', action='store_true', dest='book_id', default=False, + help='book id'), + ) def handle(self, *args, **opts): from catalogue.models import Book import search @@ -13,13 +18,16 @@ class Command(BaseCommand): if args: books = [] for a in args: - books += Book.objects.filter(slug=a).all() + if opts['book_id']: + books += Book.objects.filter(id=int(a)).all() + else: + books += Book.objects.filter(slug=a).all() else: books = Book.objects.all() for b in books: print b.title - idx.index_book(b, None) + idx.index_book(b) print 'Reindexing tags.' idx.index_tags() idx.close() diff --git a/apps/search/management/commands/snippets.py b/apps/search/management/commands/snippets.py new file mode 100755 index 000000000..058ea052e --- /dev/null +++ b/apps/search/management/commands/snippets.py @@ -0,0 +1,60 @@ +from django.core.management.base import BaseCommand + +from glob import glob +from optparse import make_option +from os import path +from sys import stdout +from django.conf import settings + +class Command(BaseCommand): + help = 'Reindex everything.' + args = '' + + option_list = BaseCommand.option_list + ( + make_option('-C', '--check-just-read', action='store_true', dest='check', default=False, + help='Check snippets utf-8'), + make_option('-c', '--check', action='store_true', dest='check2', default=False, + help='Check snippets utf-8 by walking through index'), + ) + + + def handle(self, *args, **opts): + from catalogue.models import Book + import search + + if opts['check']: + sfn = glob(settings.SEARCH_INDEX+'snippets/*') + print sfn + for fn in sfn: + print fn + bkid = int(path.basename(fn)) + with open(fn) as f: + cont = f.read() + try: + uc = cont.decode('utf-8') + except UnicodeDecodeError, ude: + print "error in snippets %d" % bkid + if opts['check2']: + s = search.Search() + reader = s.searcher.getIndexReader() + numdocs = reader.numDocs() + for did in range(numdocs): + doc = reader.document(did) + if doc and doc.get('book_id'): + bkid = int(doc.get('book_id')) + #import pdb; pdb.set_trace() + stdout.write("\r%d / %d" % (did, numdocs)) + stdout.flush() + ss = doc.get('snippet_position') + sl = doc.get('snippet_length') + if ss and sl: + snips = Snippets(bkid) + try: + txt = snips.get((ss,sl)) + assert len(txt) == sl + except UnicodeDecodeError, ude: + stdout.write("\nerror in snippets %d\n" % bkid) + raise ude + + stdout.write("\ndone.\n") + diff --git a/apps/search/views.py b/apps/search/views.py index 623b311fb..de2337f8d 100644 --- a/apps/search/views.py +++ b/apps/search/views.py @@ -37,7 +37,9 @@ def did_you_mean(query, tokens): if not dictionary.check(t): try: - change[t] = dictionary.suggest(t)[0] + change_to = dictionary.suggest(t)[0].lower() + if change_to != t.lower(): + change[t] = change_to except IndexError: pass @@ -94,7 +96,7 @@ def main(request): results = None query = None - fuzzy = False + fuzzy = False #0.8 if 'q' in request.GET: # tags = request.GET.get('tags', '') @@ -117,14 +119,11 @@ def main(request): # hint.tags(tag_list) # if book: # hint.books(book) - tags = srch.hint_tags(query, pdcounter=True, prefix=False) + tags = srch.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy) tags = split_tags(tags) toks = StringReader(query) tokens_cache = {} - fuzzy = 'fuzzy' in request.GET - if fuzzy: - fuzzy = 0.7 author_results = srch.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache) title_results = srch.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache) @@ -182,9 +181,9 @@ def main(request): if len(results) == 1: fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits) if len(fragment_hits) == 1: - anchor = fragment_hits[0]['fragment'] - frag = Fragment.objects.get(anchor=anchor) - return HttpResponseRedirect(frag.get_absolute_url()) + #anchor = fragment_hits[0]['fragment'] + #frag = Fragment.objects.get(anchor=anchor) + return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url()) return HttpResponseRedirect(results[0].book.get_absolute_url()) elif len(results) == 0: form = PublishingSuggestForm(initial={"books": query + ", "}) diff --git a/wolnelektury/static/css/book_box.css b/wolnelektury/static/css/book_box.css index abf07de4f..876bf06b0 100755 --- a/wolnelektury/static/css/book_box.css +++ b/wolnelektury/static/css/book_box.css @@ -88,10 +88,12 @@ * ingenous float containment hack * http://www.mikepadgett.com/technology/technical/alternative-to-the-pie-clearfix-hack/ */ +/* .search-result .book-box-inner { height: 1%; overflow: hidden; } +*/ .book-mini-box img, .book-box img, .book-wide-box img, .search-result img { width: 13.9em; -- 2.20.1