fix handling of spaces in names

[wolnelektury.git] / apps / search / views.py
diff --git a/apps/search/views.py b/apps/search/views.py

index 00391f1..527ca82 100644 (file)
--- a/apps/search/views.py
+++ b/apps/search/views.py
@@ -8,14 +8,14 @@ from django.views.decorators import cache
  from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect
  from django.utils.translation import ugettext as _
  
  from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect
  from django.utils.translation import ugettext as _
  
-from catalogue.utils import get_random_hash
+from catalogue.utils import split_tags
  from catalogue.models import Book, Tag, Fragment
  from catalogue.fields import dumps
  from catalogue.views import JSONResponse
  from search import Search, JVM, SearchResult
  from lucene import StringReader
  from suggest.forms import PublishingSuggestForm
  from catalogue.models import Book, Tag, Fragment
  from catalogue.fields import dumps
  from catalogue.views import JSONResponse
  from search import Search, JVM, SearchResult
  from lucene import StringReader
  from suggest.forms import PublishingSuggestForm
-
+import re
  import enchant
  
  dictionary = enchant.Dict('pl_PL')
  import enchant
  
  dictionary = enchant.Dict('pl_PL')
@@ -34,10 +34,12 @@ def did_you_mean(query, tokens):
          authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
          if len(authors) > 0:
              continue
          authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
          if len(authors) > 0:
              continue
-        
+
          if not dictionary.check(t):
              try:
          if not dictionary.check(t):
              try:
-                change[t] = dictionary.suggest(t)[0]
+                change_to = dictionary.suggest(t)[0].lower()
+                if change_to != t.lower():
+                    change[t] = change_to
              except IndexError:
                  pass
  
              except IndexError:
                  pass
  
@@ -69,14 +71,17 @@ def hint(request):
      # jezeli tagi dot tylko ksiazki, to wazne zeby te nowe byly w tej samej ksiazce
      # jesli zas dotycza themes, to wazne, zeby byly w tym samym fragmencie.
  
      # jezeli tagi dot tylko ksiazki, to wazne zeby te nowe byly w tej samej ksiazce
      # jesli zas dotycza themes, to wazne, zeby byly w tym samym fragmencie.
  
-    tags = s.hint_tags(prefix)
+    tags = s.hint_tags(prefix, pdcounter=True)
      books = s.hint_books(prefix)
  
      books = s.hint_books(prefix)
  
-    # TODO DODAC TU HINTY
+    def category_name(c):
+        if c.startswith('pd_'):
+            c=c[len('pd_'):]
+        return _(c)
  
      return JSONResponse(
          [{'label': t.name,
  
      return JSONResponse(
          [{'label': t.name,
-          'category': _(t.category),
+          'category': category_name(t.category),
            'id': t.id,
            'url': t.get_absolute_url()}
            for t in tags] + \
            'id': t.id,
            'url': t.get_absolute_url()}
            for t in tags] + \
@@ -94,35 +99,34 @@ def main(request):
  
      results = None
      query = None
  
      results = None
      query = None
-    fuzzy = False
+    fuzzy = False #0.8
  
      if 'q' in request.GET:
  
      if 'q' in request.GET:
-        tags = request.GET.get('tags', '')
+        # tags = request.GET.get('tags', '')
          query = request.GET['q']
          query = request.GET['q']
-        book_id = request.GET.get('book', None)
-        book = None
-        if book_id is not None:
-            book = get_object_or_404(Book, id=book_id)
+        # book_id = request.GET.get('book', None)
+        # book = None
+        # if book_id is not None:
+        #     book = get_object_or_404(Book, id=book_id)
  
  
-        hint = srch.hint()
-        try:
-            tag_list = Tag.get_tag_list(tags)
-        except:
-            tag_list = []
+        # hint = srch.hint()
+        # try:
+        #     tag_list = Tag.get_tag_list(tags)
+        # except:
+        #     tag_list = []
  
          if len(query) < 2:
  
          if len(query) < 2:
-            return render_to_response('catalogue/search_too_short.html', {'tags': tag_list, 'prefix': query},
+            return render_to_response('catalogue/search_too_short.html', {'prefix': query},
                                        context_instance=RequestContext(request))
  
                                        context_instance=RequestContext(request))
  
-        hint.tags(tag_list)
-        if book:
-            hint.books(book)
+        # hint.tags(tag_list)
+        # if book:
+        #     hint.books(book)
+        tags = srch.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy)
+        tags = split_tags(tags)
  
          toks = StringReader(query)
          tokens_cache = {}
  
          toks = StringReader(query)
          tokens_cache = {}
-        fuzzy = 'fuzzy' in request.GET
-        if fuzzy:
-            fuzzy = 0.7
  
          author_results = srch.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
          title_results = srch.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)
  
          author_results = srch.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
          title_results = srch.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)
@@ -137,37 +141,65 @@ def main(request):
                  b2.boost *= 1.1
              if bks is []:
                  author_title_rest.append(b)
                  b2.boost *= 1.1
              if bks is []:
                  author_title_rest.append(b)
-        
-        text_phrase = SearchResult.aggregate(srch.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False))
-        
-        everywhere = SearchResult.aggregate(srch.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache), author_title_rest)
+
+        # Do a phrase search but a term search as well - this can give us better snippets than search_everywhere,
+        # because the query is using only one field.
+        text_phrase = SearchResult.aggregate(
+            srch.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False, slop=4),
+            srch.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False))
+
+        everywhere = srch.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)
+
+        def already_found(results):
+            def f(e):
+                for r in results:
+                    if e.book_id == r.book_id:
+                        e.boost = 0.9
+                        results.append(e)
+                        return True
+                return False
+            return f
+        f = already_found(author_results + title_results + text_phrase)
+        everywhere = filter(lambda x: not f(x), everywhere)
+
+        author_results = SearchResult.aggregate(author_results)
+        title_results = SearchResult.aggregate(title_results)
+
+        everywhere = SearchResult.aggregate(everywhere, author_title_rest)
  
          for res in [author_results, title_results, text_phrase, everywhere]:
              res.sort(reverse=True)
  
          for res in [author_results, title_results, text_phrase, everywhere]:
              res.sort(reverse=True)
+            for r in res:
+                for h in r.hits:
+                    h['snippets'] = map(lambda s:
+                                        re.subn(r"(^[ \t\n]+|[ \t\n]+$)", u"",
+                                                re.subn(r"[ \t\n]*\n[ \t\n]*", u"\n", s)[0])[0], h['snippets'])
  
          suggestion = did_you_mean(query, srch.get_tokens(toks, field="SIMPLE"))
  
          suggestion = did_you_mean(query, srch.get_tokens(toks, field="SIMPLE"))
+        print "dym? %s" % repr(suggestion).encode('utf-8')
  
          results = author_results + title_results + text_phrase + everywhere
          results.sort(reverse=True)
  
          results = author_results + title_results + text_phrase + everywhere
          results.sort(reverse=True)
-        
+
          if len(results) == 1:
              fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
              if len(fragment_hits) == 1:
          if len(results) == 1:
              fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
              if len(fragment_hits) == 1:
-                anchor = fragment_hits[0]['fragment']
-                frag = Fragment.objects.get(anchor=anchor)
-                return HttpResponseRedirect(frag.get_absolute_url())
+                #anchor = fragment_hits[0]['fragment']
+                #frag = Fragment.objects.get(anchor=anchor)
+                return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
              return HttpResponseRedirect(results[0].book.get_absolute_url())
          elif len(results) == 0:
              form = PublishingSuggestForm(initial={"books": query + ", "})
              return render_to_response('catalogue/search_no_hits.html',
              return HttpResponseRedirect(results[0].book.get_absolute_url())
          elif len(results) == 0:
              form = PublishingSuggestForm(initial={"books": query + ", "})
              return render_to_response('catalogue/search_no_hits.html',
-                                      {'tags': tag_list,
+                                      {'tags': tags,
                                         'prefix': query,
                                         "form": form,
                                         'did_you_mean': suggestion},
                  context_instance=RequestContext(request))
  
                                         'prefix': query,
                                         "form": form,
                                         'did_you_mean': suggestion},
                  context_instance=RequestContext(request))
  
+        print "TAGS: %s" % tags
          return render_to_response('catalogue/search_multiple_hits.html',
          return render_to_response('catalogue/search_multiple_hits.html',
-                                  {'tags': tag_list,
+                                  {'tags': tags,
                                     'prefix': query,
                                     'results': { 'author': author_results,
                                                  'title': title_results,
                                     'prefix': query,
                                     'results': { 'author': author_results,
                                                  'title': title_results,