footnotes

[wolnelektury.git] / apps / search / views.py
diff --git a/apps/search/views.py b/apps/search/views.py

index e73b92a..cf00870 100644 (file)
--- a/apps/search/views.py
+++ b/apps/search/views.py
@@ -1,5 +1,6 @@
  # -*- coding: utf-8 -*-
  
  # -*- coding: utf-8 -*-
  
+from django.conf import settings
  from django.shortcuts import render_to_response, get_object_or_404
  from django.template import RequestContext
  from django.contrib.auth.decorators import login_required
  from django.shortcuts import render_to_response, get_object_or_404
  from django.template import RequestContext
  from django.contrib.auth.decorators import login_required
@@ -8,24 +9,32 @@ from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpRespons
  from django.utils.translation import ugettext as _
  
  from catalogue.utils import get_random_hash
  from django.utils.translation import ugettext as _
  
  from catalogue.utils import get_random_hash
-from catalogue.models import Book, Tag, Fragment, TAG_CATEGORIES
+from catalogue.models import Book, Tag, Fragment
  from catalogue.fields import dumps
  from catalogue.views import JSONResponse
  from catalogue.fields import dumps
  from catalogue.views import JSONResponse
-from catalogue import forms
  from search import Search, JVM, SearchResult
  from lucene import StringReader
  from suggest.forms import PublishingSuggestForm
  from search import Search, JVM, SearchResult
  from lucene import StringReader
  from suggest.forms import PublishingSuggestForm
-
+import re
  import enchant
  
  dictionary = enchant.Dict('pl_PL')
  
  
  import enchant
  
  dictionary = enchant.Dict('pl_PL')
  
  
+def match_word_re(word):
+    """Build a whole-word regex for `word` in the default database's dialect.
+
+    The pattern is meant for a Django ``__iregex`` lookup, so the
+    word-boundary syntax has to be the one the configured backend uses.
+    Regex metacharacters in `word` are escaped, so it is matched literally.
+    """
+    engine = settings.DATABASES['default']['ENGINE']
+    # Escape ASCII regex metacharacters only, so a token such as "a+b" or
+    # "(x" cannot alter (or break) the pattern; letters, including
+    # non-ASCII ones, reach the backend unchanged.
+    word = re.sub(r"([\\.^$*+?{}\[\]|()])", r"\\\1", word)
+    if 'mysql' in engine:
+        return "[[:<:]]%s[[:>:]]" % word
+    elif 'postgresql' in engine:
+        # PostgreSQL reads \b as a backspace; \m and \M anchor word start/end.
+        return r"\m%s\M" % word
+    # SQLite, plus a best-effort fallback for any other engine (previously
+    # the function fell through and returned None for those).
+    # NOTE(review): \b is not understood by every backend - confirm before
+    # relying on this with an engine other than the ones listed above.
+    return r"\b%s\b" % word
+
+
  def did_you_mean(query, tokens):
      change = {}
  def did_you_mean(query, tokens):
      change = {}
-    # sprawdzić, czy słowo nie jest aby autorem - proste szukanie termu w author!
      for t in tokens:
      for t in tokens:
-        print("%s ok? %s, sug: %s" %(t, dictionary.check(t), dictionary.suggest(t)))
+        authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
+        if len(authors) > 0:
+            continue
+        
          if not dictionary.check(t):
              try:
                  change[t] = dictionary.suggest(t)[0]
          if not dictionary.check(t):
              try:
                  change[t] = dictionary.suggest(t)[0]
@@ -60,7 +69,6 @@ def hint(request):
      # jezeli tagi dot tylko ksiazki, to wazne zeby te nowe byly w tej samej ksiazce
      # jesli zas dotycza themes, to wazne, zeby byly w tym samym fragmencie.
  
      # jezeli tagi dot tylko ksiazki, to wazne zeby te nowe byly w tej samej ksiazce
      # jesli zas dotycza themes, to wazne, zeby byly w tym samym fragmencie.
  
-    
      tags = s.hint_tags(prefix)
      books = s.hint_books(prefix)
  
      tags = s.hint_tags(prefix)
      books = s.hint_books(prefix)
  
@@ -79,18 +87,6 @@ def hint(request):
              for b in books])
  
  
              for b in books])
  
  
-def foo(s, q, tag_list=None):
-    hint = s.hint()
-    try:
-        tag_list = Tag.get_tag_list(tag_list)
-        hint.tags(tag_list)
-    except:
-        tag_list = None
-
-    q = StringReader(q)
-    return (q, hint)
-
-
  def main(request):
      results = {}
      JVM.attachCurrentThread()  # where to put this?
  def main(request):
      results = {}
      JVM.attachCurrentThread()  # where to put this?
@@ -123,45 +119,86 @@ def main(request):
              hint.books(book)
  
          toks = StringReader(query)
              hint.books(book)
  
          toks = StringReader(query)
+        tokens_cache = {}
          fuzzy = 'fuzzy' in request.GET
          if fuzzy:
              fuzzy = 0.7
  
          fuzzy = 'fuzzy' in request.GET
          if fuzzy:
              fuzzy = 0.7
  
-        results = SearchResult.aggregate(srch.search_perfect_book(toks, fuzzy=fuzzy, hint=hint),
-                                         srch.search_book(toks, fuzzy=fuzzy, hint=hint),
-                                         srch.search_perfect_parts(toks, fuzzy=fuzzy, hint=hint),
-                                         srch.search_everywhere(toks, fuzzy=fuzzy, hint=hint))
-
-        for r in results:
-            r.process_hits()
-
+        author_results = srch.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
+        title_results = srch.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)
+
+        # Boost the dedicated author/title results that the mixed search also
+        # found, and keep the mixed-only results for the end of the list.
+        author_title_mixed = srch.search_some(toks, ['authors', 'title', 'tags'], fuzzy=fuzzy, tokens_cache=tokens_cache)
+        author_title_rest = []
+        # Loop-invariant: neither list is modified inside the loop below.
+        author_title_results = author_results + title_results
+        for b in author_title_mixed:
+            bks = [ba for ba in author_title_results if ba.book_id == b.book_id]
+            for b2 in bks:
+                b2.boost *= 1.1
+            # The former test `bks is []` compared identity with a brand new
+            # list, was never true, and so nothing was ever saved here.
+            if not bks:
+                author_title_rest.append(b)
+
+        # Do a phrase search but a term search as well - this can give us better snippets than search_everywhere,
+        # because the query is using only one field.
+        text_phrase = SearchResult.aggregate(
+            srch.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False, slop=4),
+            srch.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False))
+
+        everywhere = srch.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)
+
+        def already_found(results):
+            """Return a predicate telling whether a result's book is in `results`.
+
+            On a match the predicate also sets that result's boost to 0.9 and
+            appends it to `results` before returning True.
+            """
+            def f(e):
+                # Guard clause: no entry in `results` shares this book id.
+                if not any(e.book_id == r.book_id for r in results):
+                    return False
+                e.boost = 0.9
+                results.append(e)
+                return True
+            return f
+        f = already_found(author_results + title_results + text_phrase)
+        everywhere = filter(lambda x: not f(x), everywhere)
+
+        author_results = SearchResult.aggregate(author_results)
+        title_results = SearchResult.aggregate(title_results)
+        
+        everywhere = SearchResult.aggregate(everywhere, author_title_rest)
+
+        for res in [author_results, title_results, text_phrase, everywhere]:
+            res.sort(reverse=True)
+            for r in res:
+                for h in r.hits:
+                    h['snippets'] = map(lambda s:
+                                        re.subn(r"(^[ \t\n]+|[ \t\n]+$)", u"", 
+                                                re.subn(r"[ \t\n]*\n[ \t\n]*", u"\n", s)[0])[0], h['snippets'])
+                    
+        suggestion = did_you_mean(query, srch.get_tokens(toks, field="SIMPLE"))
+        print "dym? %s" % repr(suggestion).encode('utf-8')
+        
+        results = author_results + title_results + text_phrase + everywhere
          results.sort(reverse=True)
          results.sort(reverse=True)
-
-        for r in results:
-            print "-----"
-            for h in r.hits:
-                print "- %s" % h
-
+        
          if len(results) == 1:
          if len(results) == 1:
-            if len(results[0].hits) == 0:
-                return HttpResponseRedirect(results[0].book.get_absolute_url())
-            elif len(results[0].hits) == 1 and results[0].hits[0] is not None:
-                frag = Fragment.objects.get(anchor=results[0].hits[0])
+            fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
+            if len(fragment_hits) == 1:
+                anchor = fragment_hits[0]['fragment']
+                frag = Fragment.objects.get(anchor=anchor)
                  return HttpResponseRedirect(frag.get_absolute_url())
                  return HttpResponseRedirect(frag.get_absolute_url())
+            return HttpResponseRedirect(results[0].book.get_absolute_url())
          elif len(results) == 0:
              form = PublishingSuggestForm(initial={"books": query + ", "})
              return render_to_response('catalogue/search_no_hits.html',
          elif len(results) == 0:
              form = PublishingSuggestForm(initial={"books": query + ", "})
              return render_to_response('catalogue/search_no_hits.html',
-                                      {'tags': tag_list, 'prefix': query, "pubsuggest_form": form,
-                                       'form': forms.SearchForm()},
+                                      {'tags': tag_list,
+                                       'prefix': query,
+                                       "form": form,
+                                       'did_you_mean': suggestion},
                  context_instance=RequestContext(request))
  
          return render_to_response('catalogue/search_multiple_hits.html',
                  context_instance=RequestContext(request))
  
          return render_to_response('catalogue/search_multiple_hits.html',
-                                  {'tags': tag_list, 'prefix': query,
-                                   'results': results, 'from': forms.SearchForm()},
+                                  {'tags': tag_list,
+                                   'prefix': query,
+                                   'results': { 'author': author_results,
+                                                'title': title_results,
+                                                'content': text_phrase,
+                                                'other': everywhere},
+                                   'did_you_mean': suggestion},
              context_instance=RequestContext(request))
              context_instance=RequestContext(request))
-
-    # return render_to_response('newsearch/search.html', {'results': results,
-    #                                                     'did_you_mean': (query is not None) and
-    #                                                     did_you_mean(query, srch.get_tokens(query, field='SIMPLE')),
-    #                                                     'fuzzy': fuzzy},
-    #                           context_instance=RequestContext(request))