search changes

[wolnelektury.git] / apps / search / views.py
diff --git a/apps/search/views.py b/apps/search/views.py

index fd5883e..36dd52c 100644 (file)
--- a/apps/search/views.py
+++ b/apps/search/views.py
@@ -10,15 +10,15 @@ from django.utils.translation import ugettext as _
  
  from catalogue.utils import split_tags
  from catalogue.models import Book, Tag, Fragment
  
  from catalogue.utils import split_tags
  from catalogue.models import Book, Tag, Fragment
+from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  from catalogue.views import JSONResponse
  from catalogue.views import JSONResponse
-from search import Search, JVM, SearchResult
+from search import Search, SearchResult
  from lucene import StringReader
  from suggest.forms import PublishingSuggestForm
  from time import sleep
  import re
  from lucene import StringReader
  from suggest.forms import PublishingSuggestForm
  from time import sleep
  import re
-import enchant
-
-dictionary = enchant.Dict('pl_PL')
+#import enchant
+import json
  
  
  def match_word_re(word):
  
  
  def match_word_re(word):
@@ -29,59 +29,37 @@ def match_word_re(word):
  
  
  def did_you_mean(query, tokens):
  
  
  def did_you_mean(query, tokens):
-    change = {}
-    for t in tokens:
-        authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
-        if len(authors) > 0:
-            continue
-
-        if not dictionary.check(t):
-            try:
-                change_to = dictionary.suggest(t)[0].lower()
-                if change_to != t.lower():
-                    change[t] = change_to
-            except IndexError:
-                pass
-
-    if change == {}:
-        return None
-
-    for frm, to in change.items():
-        query = query.replace(frm, to)
-
      return query
      return query
+    # change = {}
+    # for t in tokens:
+    #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
+    #     if len(authors) > 0:
+    #         continue
  
  
+    #     if False:
+    #         if not dictionary.check(t):
+    #             try:
+    #                 change_to = dictionary.suggest(t)[0].lower()
+    #                 if change_to != t.lower():
+    #                     change[t] = change_to
+    #             except IndexError:
+    #                 pass
  
  
-JVM.attachCurrentThread()
-_search = None
-
-
-def get_search():
-    global _search
+    # if change == {}:
+    #     return None
  
  
-    while _search is False:
-        sleep(1)
+    # for frm, to in change.items():
+    #     query = query.replace(frm, to)
  
  
-    if _search is None:
-        _search = False
-        _search = Search()
-    return _search
+    # return query
  
  
  def hint(request):
      prefix = request.GET.get('term', '')
      if len(prefix) < 2:
          return JSONResponse([])
  
  
  def hint(request):
      prefix = request.GET.get('term', '')
      if len(prefix) < 2:
          return JSONResponse([])
-    JVM.attachCurrentThread()
-
-    search = get_search()
-    hint = search.hint()
-    try:
-        tags = request.GET.get('tags', '')
-        hint.tags(Tag.get_tag_list(tags))
-    except:
-        pass
  
  
+    search = Search()
      # tags will restrict the hints here,
      # but tags can be attached to a book as well as to fragments;
      # if the tags apply only to the book, it is important that the new ones belong to the same book
      # tags will restrict the hints here,
      # but tags can be attached to a book as well as to fragments;
      # if the tags apply only to the book, it is important that the new ones belong to the same book
@@ -90,79 +68,83 @@ def hint(request):
      tags = search.hint_tags(prefix, pdcounter=True)
      books = search.hint_books(prefix)
  
      tags = search.hint_tags(prefix, pdcounter=True)
      books = search.hint_books(prefix)
  
+    def is_dupe(tag):
+        if isinstance(tag, PDCounterAuthor):
+            if filter(lambda t: t.slug == tag.slug and t != tag, tags):
+                return True
+        elif isinstance(tag, PDCounterBook):
+            if filter(lambda b: b.slug == tag.slug, tags):
+                return True
+        return False
+
+    tags = filter(lambda t: not is_dupe(t), tags)
+
      def category_name(c):
          if c.startswith('pd_'):
              c = c[len('pd_'):]
          return _(c)
  
      def category_name(c):
          if c.startswith('pd_'):
              c = c[len('pd_'):]
          return _(c)
  
-    return JSONResponse(
-        [{'label': t.name,
-          'category': category_name(t.category),
-          'id': t.id,
-          'url': t.get_absolute_url()}
-          for t in tags] + \
-          [{'label': b.title,
-            'category': _('book'),
-            'id': b.id,
-            'url': b.get_absolute_url()}
-            for b in books])
+    callback = request.GET.get('callback', None)
+    data = [{'label': t.name,
+              'category': category_name(t.category),
+              'id': t.id,
+              'url': t.get_absolute_url()}
+              for t in tags] + \
+              [{'label': b.title,
+                'category': _('book'),
+                'id': b.id,
+                'url': b.get_absolute_url()}
+                for b in books]
+    if callback:
+        return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
+                            content_type="application/json; charset=utf-8")
+    else:
+        return JSONResponse(data)
  
  
  def main(request):
      results = {}
  
  
  def main(request):
      results = {}
-    JVM.attachCurrentThread()  # where to put this?
  
      results = None
      query = None
  
      results = None
      query = None
-    fuzzy = False #0.8
  
  
-    query = request.GET.get('q','')
-    # book_id = request.GET.get('book', None)
-    # book = None
-    # if book_id is not None:
-    #     book = get_object_or_404(Book, id=book_id)
-
-    # hint = search.hint()
-    # try:
-    #     tag_list = Tag.get_tag_list(tags)
-    # except:
-    #     tag_list = []
+    query = request.GET.get('q', '')
  
      if len(query) < 2:
  
      if len(query) < 2:
-        return render_to_response('catalogue/search_too_short.html', {'prefix': query},
-                                  context_instance=RequestContext(request))
-
-    search = get_search()
-    # hint.tags(tag_list)
-    # if book:
-    #     hint.books(book)
-    tags = search.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy)
-    tags = split_tags(tags)
+        return render_to_response('catalogue/search_too_short.html',
+                                  {'prefix': query},
+            context_instance=RequestContext(request))
+    search = Search()
  
  
-    toks = StringReader(query)
-    tokens_cache = {}
+    theme_terms = search.index.analyze(text=query, field="themes_pl") \
+        + search.index.analyze(text=query, field="themes")
  
  
-    author_results = search.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
-    title_results = search.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)
+            # change hints
+    tags = search.hint_tags(query, pdcounter=True, prefix=False)
+    tags = split_tags(tags)
+
+    author_results = search.search_phrase(query, 'authors', book=True)
+    title_results = search.search_phrase(query, 'title', book=True)
  
      # Boost main author/title results with mixed search, and save some of its results for end of list.
      # boost author, title results
  
      # Boost main author/title results with mixed search, and save some of its results for end of list.
      # boost author, title results
-    author_title_mixed = search.search_some(toks, ['authors', 'title', 'tags'], fuzzy=fuzzy, tokens_cache=tokens_cache)
+    author_title_mixed = search.search_some(query, ['authors', 'title', 'tags'], query_terms=theme_terms)
      author_title_rest = []
      author_title_rest = []
+
      for b in author_title_mixed:
      for b in author_title_mixed:
-        bks = filter(lambda ba: ba.book_id == b.book_id, author_results + title_results)
-        for b2 in bks:
+        also_in_mixed = filter(lambda ba: ba.book_id == b.book_id, author_results + title_results)
+        for b2 in also_in_mixed:
              b2.boost *= 1.1
              b2.boost *= 1.1
-        if bks is []:
+        if also_in_mixed is []:
              author_title_rest.append(b)
  
      # Do a phrase search but a term search as well - this can give us better snippets than search_everywhere,
      # Because the query is using only one field.
      text_phrase = SearchResult.aggregate(
              author_title_rest.append(b)
  
      # Do a phrase search but a term search as well - this can give us better snippets than search_everywhere,
      # Because the query is using only one field.
      text_phrase = SearchResult.aggregate(
-        search.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False, slop=4),
-        search.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False))
+        search.search_phrase(query, 'text', snippets=True, book=False),
+        search.search_some(query, ['text'], snippets=True, book=False, query_terms=theme_terms))
  
  
-    everywhere = search.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)
+    everywhere = search.search_everywhere(query, query_terms=theme_terms)
  
      def already_found(results):
          def f(e):
  
      def already_found(results):
          def f(e):
@@ -181,15 +163,15 @@ def main(request):
  
      everywhere = SearchResult.aggregate(everywhere, author_title_rest)
  
  
      everywhere = SearchResult.aggregate(everywhere, author_title_rest)
  
-    for res in [author_results, title_results, text_phrase, everywhere]:
+    for field, res in [('authors', author_results),
+                       ('title', title_results),
+                       ('text', text_phrase),
+                       ('text', everywhere)]:
          res.sort(reverse=True)
          for r in res:
          res.sort(reverse=True)
          for r in res:
-            for h in r.hits:
-                h['snippets'] = map(lambda s:
-                                    re.subn(r"(^[ \t\n]+|[ \t\n]+$)", u"",
-                                            re.subn(r"[ \t\n]*\n[ \t\n]*", u"\n", s)[0])[0], h['snippets'])
+            search.get_snippets(r, query, field, 3)
  
  
-    suggestion = did_you_mean(query, search.get_tokens(toks, field="SIMPLE"))
+    suggestion = u''
  
      def ensure_exists(r):
          try:
  
      def ensure_exists(r):
          try:
@@ -206,14 +188,15 @@ def main(request):
      # ensure books exist & sort them
      results.sort(reverse=True)
  
      # ensure books exist & sort them
      results.sort(reverse=True)
  
-    if len(results) == 1:
-        fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
-        if len(fragment_hits) == 1:
-            #anchor = fragment_hits[0]['fragment']
-            #frag = Fragment.objects.get(anchor=anchor)
-            return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
-        return HttpResponseRedirect(results[0].book.get_absolute_url())
-    elif len(results) == 0:
+    # We don't want to redirect to book text, but rather display result page even with one result.
+    # if len(results) == 1:
+    #     fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
+    #     if len(fragment_hits) == 1:
+    #         #anchor = fragment_hits[0]['fragment']
+    #         #frag = Fragment.objects.get(anchor=anchor)
+    #         return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
+    #     return HttpResponseRedirect(results[0].book.get_absolute_url())
+    if len(results) == 0:
          form = PublishingSuggestForm(initial={"books": query + ", "})
          return render_to_response('catalogue/search_no_hits.html',
                                    {'tags': tags,
          form = PublishingSuggestForm(initial={"books": query + ", "})
          return render_to_response('catalogue/search_no_hits.html',
                                    {'tags': tags,
@@ -225,9 +208,9 @@ def main(request):
      return render_to_response('catalogue/search_multiple_hits.html',
                                {'tags': tags,
                                 'prefix': query,
      return render_to_response('catalogue/search_multiple_hits.html',
                                {'tags': tags,
                                 'prefix': query,
-                               'results': { 'author': author_results,
-                                            'title': title_results,
-                                            'content': text_phrase,
-                                            'other': everywhere},
+                               'results': {'author': author_results,
+                                           'title': title_results,
+                                           'content': text_phrase,
+                                           'other': everywhere},
                                 'did_you_mean': suggestion},
          context_instance=RequestContext(request))
                                 'did_you_mean': suggestion},
          context_instance=RequestContext(request))