search too many files fix?
[wolnelektury.git] / apps / search / views.py
index e9b2564..4db4bb8 100644 (file)
@@ -8,14 +8,14 @@ from django.views.decorators import cache
 from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect
 from django.utils.translation import ugettext as _
 
 from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect
 from django.utils.translation import ugettext as _
 
-from catalogue.utils import get_random_hash
+from catalogue.utils import split_tags
 from catalogue.models import Book, Tag, Fragment
 from catalogue.fields import dumps
 from catalogue.views import JSONResponse
 from search import Search, JVM, SearchResult
 from lucene import StringReader
 from suggest.forms import PublishingSuggestForm
 from catalogue.models import Book, Tag, Fragment
 from catalogue.fields import dumps
 from catalogue.views import JSONResponse
 from search import Search, JVM, SearchResult
 from lucene import StringReader
 from suggest.forms import PublishingSuggestForm
-
+import re
 import enchant
 
 dictionary = enchant.Dict('pl_PL')
 import enchant
 
 dictionary = enchant.Dict('pl_PL')
@@ -34,10 +34,12 @@ def did_you_mean(query, tokens):
         authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
         if len(authors) > 0:
             continue
         authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
         if len(authors) > 0:
             continue
-        
+
         if not dictionary.check(t):
             try:
         if not dictionary.check(t):
             try:
-                change[t] = dictionary.suggest(t)[0]
+                change_to = dictionary.suggest(t)[0].lower()
+                if change_to != t.lower():
+                    change[t] = change_to
             except IndexError:
                 pass
 
             except IndexError:
                 pass
 
@@ -49,15 +51,15 @@ def did_you_mean(query, tokens):
 
     return query
 
 
     return query
 
+search = Search()
 
 def hint(request):
     prefix = request.GET.get('term', '')
     if len(prefix) < 2:
         return JSONResponse([])
     JVM.attachCurrentThread()
 
 def hint(request):
     prefix = request.GET.get('term', '')
     if len(prefix) < 2:
         return JSONResponse([])
     JVM.attachCurrentThread()
-    s = Search()
 
 
-    hint = s.hint()
+    hint = search.hint()
     try:
         tags = request.GET.get('tags', '')
         hint.tags(Tag.get_tag_list(tags))
     try:
         tags = request.GET.get('tags', '')
         hint.tags(Tag.get_tag_list(tags))
@@ -69,14 +71,17 @@ def hint(request):
     # jezeli tagi dot tylko ksiazki, to wazne zeby te nowe byly w tej samej ksiazce
     # jesli zas dotycza themes, to wazne, zeby byly w tym samym fragmencie.
 
     # jezeli tagi dot tylko ksiazki, to wazne zeby te nowe byly w tej samej ksiazce
     # jesli zas dotycza themes, to wazne, zeby byly w tym samym fragmencie.
 
-    tags = s.hint_tags(prefix)
-    books = s.hint_books(prefix)
+    tags = search.hint_tags(prefix, pdcounter=True)
+    books = search.hint_books(prefix)
 
 
-    # TODO DODAC TU HINTY
+    def category_name(c):
+        if c.startswith('pd_'):
+            c=c[len('pd_'):]
+        return _(c)
 
     return JSONResponse(
         [{'label': t.name,
 
     return JSONResponse(
         [{'label': t.name,
-          'category': _(t.category),
+          'category': category_name(t.category),
           'id': t.id,
           'url': t.get_absolute_url()}
           for t in tags] + \
           'id': t.id,
           'url': t.get_absolute_url()}
           for t in tags] + \
@@ -90,46 +95,44 @@ def hint(request):
 def main(request):
     results = {}
     JVM.attachCurrentThread()  # where to put this?
 def main(request):
     results = {}
     JVM.attachCurrentThread()  # where to put this?
-    srch = Search()
 
     results = None
     query = None
 
     results = None
     query = None
-    fuzzy = False
+    fuzzy = False #0.8
 
     if 'q' in request.GET:
 
     if 'q' in request.GET:
-        tags = request.GET.get('tags', '')
+        tags = request.GET.get('tags', '')
         query = request.GET['q']
         query = request.GET['q']
-        book_id = request.GET.get('book', None)
-        book = None
-        if book_id is not None:
-            book = get_object_or_404(Book, id=book_id)
+        book_id = request.GET.get('book', None)
+        book = None
+        if book_id is not None:
+            book = get_object_or_404(Book, id=book_id)
 
 
-        hint = srch.hint()
-        try:
-            tag_list = Tag.get_tag_list(tags)
-        except:
-            tag_list = []
+        # hint = search.hint()
+        try:
+            tag_list = Tag.get_tag_list(tags)
+        except:
+            tag_list = []
 
         if len(query) < 2:
 
         if len(query) < 2:
-            return render_to_response('catalogue/search_too_short.html', {'tags': tag_list, 'prefix': query},
+            return render_to_response('catalogue/search_too_short.html', {'prefix': query},
                                       context_instance=RequestContext(request))
 
                                       context_instance=RequestContext(request))
 
-        hint.tags(tag_list)
-        if book:
-            hint.books(book)
+        # hint.tags(tag_list)
+        # if book:
+        #     hint.books(book)
+        tags = search.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy)
+        tags = split_tags(tags)
 
         toks = StringReader(query)
         tokens_cache = {}
 
         toks = StringReader(query)
         tokens_cache = {}
-        fuzzy = 'fuzzy' in request.GET
-        if fuzzy:
-            fuzzy = 0.7
 
 
-        author_results = srch.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
-        title_results = srch.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)
+        author_results = search.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
+        title_results = search.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)
 
         # Boost main author/title results with mixed search, and save some of its results for end of list.
         # boost author, title results
 
         # Boost main author/title results with mixed search, and save some of its results for end of list.
         # boost author, title results
-        author_title_mixed = srch.search_some(toks, ['authors', 'title', 'tags'], fuzzy=fuzzy, tokens_cache=tokens_cache)
+        author_title_mixed = search.search_some(toks, ['authors', 'title', 'tags'], fuzzy=fuzzy, tokens_cache=tokens_cache)
         author_title_rest = []
         for b in author_title_mixed:
             bks = filter(lambda ba: ba.book_id == b.book_id, author_results + title_results)
         author_title_rest = []
         for b in author_title_mixed:
             bks = filter(lambda ba: ba.book_id == b.book_id, author_results + title_results)
@@ -137,38 +140,65 @@ def main(request):
                 b2.boost *= 1.1
             if bks is []:
                 author_title_rest.append(b)
                 b2.boost *= 1.1
             if bks is []:
                 author_title_rest.append(b)
-        
-        text_phrase = SearchResult.aggregate(srch.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache))
-        [r.process_hits() for r in text_phrase]
-        
-        everywhere = SearchResult.aggregate(srch.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache), author_title_rest)
-        [r.process_hits() for r in everywhere]
+
+        # Do a phrase search but a term search as well - this can give us better snippets then search_everywhere,
+        # Because the query is using only one field.
+        text_phrase = SearchResult.aggregate(
+            search.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False, slop=4),
+            search.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False))
+
+        everywhere = search.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)
+
+        def already_found(results):
+            def f(e):
+                for r in results:
+                    if e.book_id == r.book_id:
+                        e.boost = 0.9
+                        results.append(e)
+                        return True
+                return False
+            return f
+        f = already_found(author_results + title_results + text_phrase)
+        everywhere = filter(lambda x: not f(x), everywhere)
+
+        author_results = SearchResult.aggregate(author_results)
+        title_results = SearchResult.aggregate(title_results)
+
+        everywhere = SearchResult.aggregate(everywhere, author_title_rest)
 
         for res in [author_results, title_results, text_phrase, everywhere]:
             res.sort(reverse=True)
 
         for res in [author_results, title_results, text_phrase, everywhere]:
             res.sort(reverse=True)
+            for r in res:
+                for h in r.hits:
+                    h['snippets'] = map(lambda s:
+                                        re.subn(r"(^[ \t\n]+|[ \t\n]+$)", u"",
+                                                re.subn(r"[ \t\n]*\n[ \t\n]*", u"\n", s)[0])[0], h['snippets'])
 
 
-        suggestion = did_you_mean(query, srch.get_tokens(toks, field="SIMPLE"))
+        suggestion = did_you_mean(query, search.get_tokens(toks, field="SIMPLE"))
+        print "dym? %s" % repr(suggestion).encode('utf-8')
 
         results = author_results + title_results + text_phrase + everywhere
         results.sort(reverse=True)
 
         results = author_results + title_results + text_phrase + everywhere
         results.sort(reverse=True)
-        
+
         if len(results) == 1:
         if len(results) == 1:
-            if len(results[0].hits) == 0:
-                return HttpResponseRedirect(results[0].book.get_absolute_url())
-            elif len(results[0].hits) == 1 and results[0].hits[0] is not None:
-                frag = Fragment.objects.get(anchor=results[0].hits[0])
-                return HttpResponseRedirect(frag.get_absolute_url())
+            fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
+            if len(fragment_hits) == 1:
+                #anchor = fragment_hits[0]['fragment']
+                #frag = Fragment.objects.get(anchor=anchor)
+                return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
+            return HttpResponseRedirect(results[0].book.get_absolute_url())
         elif len(results) == 0:
             form = PublishingSuggestForm(initial={"books": query + ", "})
             return render_to_response('catalogue/search_no_hits.html',
         elif len(results) == 0:
             form = PublishingSuggestForm(initial={"books": query + ", "})
             return render_to_response('catalogue/search_no_hits.html',
-                                      {'tags': tag_list,
+                                      {'tags': tags,
                                        'prefix': query,
                                        "form": form,
                                        'did_you_mean': suggestion},
                 context_instance=RequestContext(request))
 
                                        'prefix': query,
                                        "form": form,
                                        'did_you_mean': suggestion},
                 context_instance=RequestContext(request))
 
+        print "TAGS: %s" % tags
         return render_to_response('catalogue/search_multiple_hits.html',
         return render_to_response('catalogue/search_multiple_hits.html',
-                                  {'tags': tag_list,
+                                  {'tags': tags,
                                    'prefix': query,
                                    'results': { 'author': author_results,
                                                 'title': title_results,
                                    'prefix': query,
                                    'results': { 'author': author_results,
                                                 'title': title_results,