revamp search hints
[wolnelektury.git] / src / search / views.py
index 20b5e88..70a216e 100644 (file)
@@ -10,7 +10,7 @@ from django.http import HttpResponse, JsonResponse
 from django.utils.translation import ugettext as _
 
 from catalogue.utils import split_tags
 from django.utils.translation import ugettext as _
 
 from catalogue.utils import split_tags
-from catalogue.models import Book
+from catalogue.models import Book, Tag
 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
 from search.index import Search, SearchResult
 from suggest.forms import PublishingSuggestForm
 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
 from search.index import Search, SearchResult
 from suggest.forms import PublishingSuggestForm
@@ -66,26 +66,6 @@ def hint(request):
 
     prefix = remove_query_syntax_chars(prefix)
 
 
     prefix = remove_query_syntax_chars(prefix)
 
-    search = Search()
-    # tagi beda ograniczac tutaj
-    # ale tagi moga byc na ksiazce i na fragmentach
-    # jezeli tagi dot tylko ksiazki, to wazne zeby te nowe byly w tej samej ksiazce
-    # jesli zas dotycza themes, to wazne, zeby byly w tym samym fragmencie.
-
-    def is_dupe(tag):
-        if isinstance(tag, PDCounterAuthor):
-            if filter(lambda t: t.slug == tag.slug and t != tag, tags):
-                return True
-        elif isinstance(tag, PDCounterBook):
-            if filter(lambda b: b.slug == tag.slug, tags):
-                return True
-        return False
-
-    def category_name(c):
-        if c.startswith('pd_'):
-            c = c[len('pd_'):]
-        return _(c)
-
     try:
         limit = int(request.GET.get('max', ''))
     except ValueError:
     try:
         limit = int(request.GET.get('max', ''))
     except ValueError:
@@ -94,33 +74,25 @@ def hint(request):
         if limit < 1:
             limit = -1
 
         if limit < 1:
             limit = -1
 
-    data = []
-
-    tags = search.hint_tags(prefix, pdcounter=True)
-    tags = filter(lambda t: not is_dupe(t), tags)
-    for t in tags:
-        if not limit:
-            break
-        limit -= 1
-        data.append({
-            'label': t.name,
-            'category': category_name(t.category),
-            'id': t.id,
-            'url': t.get_absolute_url()
-            })
-    if limit:
-        books = search.hint_books(prefix)
-        for b in books:
-            if not limit:
-                break
-            limit -= 1
-            data.append({
-                'label': b.title,
+    data = [
+        {
+            'label': author.name,
+            'category': _('author'),
+            'id': author.id,
+            'url': author.get_absolute_url(),
+        }
+        for author in Tag.objects.filter(category='author', name__iregex='\m' + prefix)[:10]
+    ]
+    if len(data) < limit:
+        data += [
+            {
+                'label': '<cite>%s</cite>, %s' % (b.title, b.author_unicode()),
                 'category': _('book'),
                 'id': b.id,
                 'url': b.get_absolute_url()
                 'category': _('book'),
                 'id': b.id,
                 'url': b.get_absolute_url()
-                })
-
+            }
+            for b in Book.objects.filter(title__iregex='\m' + prefix)[:limit-len(data)]
+        ]
     callback = request.GET.get('callback', None)
     if callback:
         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
     callback = request.GET.get('callback', None)
     if callback:
         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
@@ -135,7 +107,7 @@ def main(request):
     query = ' '.join(query.split())
     # filter out private use characters
     import unicodedata
     query = ' '.join(query.split())
     # filter out private use characters
     import unicodedata
-    query = ''.join(ch for ch in query if unicodedata.category(ch) == 'Co')
+    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
 
     if len(query) < 2:
         return render_to_response(
 
     if len(query) < 2:
         return render_to_response(
@@ -146,67 +118,42 @@ def main(request):
             'catalogue/search_too_long.html', {'prefix': query}, context_instance=RequestContext(request))
 
     query = remove_query_syntax_chars(query)
             'catalogue/search_too_long.html', {'prefix': query}, context_instance=RequestContext(request))
 
     query = remove_query_syntax_chars(query)
-    
-    search = Search()
 
 
-    theme_terms = search.index.analyze(text=query, field="themes_pl") \
-        + search.index.analyze(text=query, field="themes")
+    words = query.split()
+    if len(words) > 10:
+        query = ' '.join(words[:10])
+
+    search = Search()
 
 
-    # change hints
     tags = search.hint_tags(query, pdcounter=True, prefix=False)
     tags = split_tags(tags)
 
     tags = search.hint_tags(query, pdcounter=True, prefix=False)
     tags = split_tags(tags)
 
-    author_results = search.search_phrase(query, 'authors', book=True)
-    translator_results = search.search_phrase(query, 'translators', book=True)
-
-    title_results = search.search_phrase(query, 'title', book=True)
-
-    # Boost main author/title results with mixed search, and save some of its results for end of list.
-    # boost author, title results
-    author_title_mixed = search.search_some(query, ['authors', 'translators', 'title', 'tags'], query_terms=theme_terms)
-    author_title_rest = []
-
-    for b in author_title_mixed:
-        also_in_mixed = filter(lambda ba: ba.book_id == b.book_id, author_results + translator_results + title_results)
-        for b2 in also_in_mixed:
-            b2.boost *= 1.1
-        if also_in_mixed is []:
-            author_title_rest.append(b)
-
-    # Do a phrase search but a term search as well - this can give us better snippets then search_everywhere,
-    # Because the query is using only one field.
-    text_phrase = SearchResult.aggregate(
-        search.search_phrase(query, 'text', snippets=True, book=False),
-        search.search_some(query, ['text'], snippets=True, book=False, query_terms=theme_terms))
-
-    everywhere = search.search_everywhere(query, query_terms=theme_terms)
-
-    def already_found(results):
-        def f(e):
-            for r in results:
-                if e.book_id == r.book_id:
-                    e.boost = 0.9
-                    results.append(e)
-                    return True
-            return False
-        return f
-    f = already_found(author_results + translator_results + title_results + text_phrase)
-    everywhere = filter(lambda x: not f(x), everywhere)
-
-    author_results = SearchResult.aggregate(author_results)
-    translator_results = SearchResult.aggregate(translator_results)
-    title_results = SearchResult.aggregate(title_results)
-
-    everywhere = SearchResult.aggregate(everywhere, author_title_rest)
-
-    for field, res in [('authors', author_results),
-                       ('translators', translator_results),
-                       ('title', title_results),
-                       ('text', text_phrase),
-                       ('text', everywhere)]:
-        res.sort(reverse=True)
-        for r in res:
-            search.get_snippets(r, query, field, 3)
+    results_parts = []
+
+    search_fields = []
+    fieldsets = (
+        (['authors'], True),
+        (['title'], True),
+        (['metadata'], True),
+        (['text', 'themes_pl'], False),
+    )
+    for fieldset, is_book in fieldsets:
+        search_fields += fieldset
+        results_parts.append(search.search_words(words, search_fields, book=is_book))
+
+    results = []
+    ids_results = {}
+    for results_part in results_parts:
+        for result in sorted(SearchResult.aggregate(results_part), reverse=True):
+            book_id = result.book_id
+            if book_id in ids_results:
+                ids_results[book_id].merge(result)
+            else:
+                results.append(result)
+                ids_results[book_id] = result
+
+    for result in results:
+        search.get_snippets(result, query, num=3)
 
     suggestion = u''
 
 
     suggestion = u''
 
@@ -216,26 +163,9 @@ def main(request):
         except Book.DoesNotExist:
             return False
 
         except Book.DoesNotExist:
             return False
 
-    author_results = filter(ensure_exists, author_results)
-    translator_results = filter(ensure_exists, translator_results)
-    title_results = filter(ensure_exists, title_results)
-    text_phrase = filter(ensure_exists, text_phrase)
-    everywhere = filter(ensure_exists, everywhere)
-
-    results = author_results + translator_results + title_results + text_phrase + everywhere
-    # ensure books do exists & sort them
-    for res in (author_results, translator_results, title_results, text_phrase, everywhere):
-        res.sort(reverse=True)
-
-    # We don't want to redirect to book text, but rather display result page even with one result.
-    # if len(results) == 1:
-    #     fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
-    #     if len(fragment_hits) == 1:
-    #         #anchor = fragment_hits[0]['fragment']
-    #         #frag = Fragment.objects.get(anchor=anchor)
-    #         return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
-    #     return HttpResponseRedirect(results[0].book.get_absolute_url())
-    if len(results) == 0:
+    results = filter(ensure_exists, results)
+
+    if not results:
         form = PublishingSuggestForm(initial={"books": query + ", "})
         return render_to_response(
             'catalogue/search_no_hits.html',
         form = PublishingSuggestForm(initial={"books": query + ", "})
         return render_to_response(
             'catalogue/search_no_hits.html',
@@ -250,15 +180,9 @@ def main(request):
     return render_to_response(
         'catalogue/search_multiple_hits.html',
         {
     return render_to_response(
         'catalogue/search_multiple_hits.html',
         {
-            'tags': tags,
+            'tags': tags['author'] + tags['kind'] + tags['genre'] + tags['epoch'] + tags['theme'],
             'prefix': query,
             'prefix': query,
-            'results': {
-                'author': author_results,
-                'translator': translator_results,
-                'title': title_results,
-                'content': text_phrase,
-                'other': everywhere
-            },
+            'results': results,
             'did_you_mean': suggestion
         },
         context_instance=RequestContext(request))
             'did_you_mean': suggestion
         },
         context_instance=RequestContext(request))