disable crawling for catalogue pages with multiple tags
[wolnelektury.git] / src / search / views.py
index 67c3fd4..7e512be 100644 (file)
@@ -7,16 +7,16 @@ from django.shortcuts import render_to_response
 from django.template import RequestContext
 from django.views.decorators import cache
 from django.http import HttpResponse, JsonResponse
 from django.template import RequestContext
 from django.views.decorators import cache
 from django.http import HttpResponse, JsonResponse
-from django.utils.translation import ugettext as _
 
 from catalogue.utils import split_tags
 
 from catalogue.utils import split_tags
-from catalogue.models import Book
-from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
+from catalogue.models import Book, Tag
 from search.index import Search, SearchResult
 from suggest.forms import PublishingSuggestForm
 import re
 import json
 
 from search.index import Search, SearchResult
 from suggest.forms import PublishingSuggestForm
 import re
 import json
 
+from wolnelektury.utils import re_escape
+
 
 def match_word_re(word):
     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
 
 def match_word_re(word):
     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
@@ -29,7 +29,7 @@ query_syntax_chars = re.compile(r"[\\/*:(){}]")
 
 
 def remove_query_syntax_chars(query, replace=' '):
 
 
 def remove_query_syntax_chars(query, replace=' '):
-    return query_syntax_chars.sub(' ', query)
+    return query_syntax_chars.sub(replace, query)
 
 
 def did_you_mean(query, tokens):
 
 
 def did_you_mean(query, tokens):
@@ -64,63 +64,36 @@ def hint(request):
     if len(prefix) < 2:
         return JsonResponse([], safe=False)
 
     if len(prefix) < 2:
         return JsonResponse([], safe=False)
 
-    prefix = remove_query_syntax_chars(prefix)
-
-    search = Search()
-    # tagi beda ograniczac tutaj
-    # ale tagi moga byc na ksiazce i na fragmentach
-    # jezeli tagi dot tylko ksiazki, to wazne zeby te nowe byly w tej samej ksiazce
-    # jesli zas dotycza themes, to wazne, zeby byly w tym samym fragmencie.
-
-    def is_dupe(tag):
-        if isinstance(tag, PDCounterAuthor):
-            if filter(lambda t: t.slug == tag.slug and t != tag, tags):
-                return True
-        elif isinstance(tag, PDCounterBook):
-            if filter(lambda b: b.slug == tag.slug, tags):
-                return True
-        return False
-
-    def category_name(c):
-        if c.startswith('pd_'):
-            c = c[len('pd_'):]
-        return _(c)
+    prefix = re_escape(' '.join(remove_query_syntax_chars(prefix).split()))
 
     try:
         limit = int(request.GET.get('max', ''))
     except ValueError:
 
     try:
         limit = int(request.GET.get('max', ''))
     except ValueError:
-        limit = -1
+        limit = 20
     else:
         if limit < 1:
     else:
         if limit < 1:
-            limit = -1
-
-    data = []
-
-    tags = search.hint_tags(prefix, pdcounter=True)
-    tags = filter(lambda t: not is_dupe(t), tags)
-    for t in tags:
-        if not limit:
-            break
-        limit -= 1
-        data.append({
-            'label': t.name,
-            'category': category_name(t.category),
-            'id': t.id,
-            'url': t.get_absolute_url()
-            })
-    if limit:
-        books = search.hint_books(prefix)
-        for b in books:
-            if not limit:
-                break
-            limit -= 1
-            data.append({
+            limit = 20
+
+    authors = Tag.objects.filter(
+        category='author', name_pl__iregex='\m' + prefix).only('name', 'id', 'slug', 'category')
+    data = [
+        {
+            'label': author.name,
+            'id': author.id,
+            'url': author.get_absolute_url(),
+        }
+        for author in authors[:limit]
+    ]
+    if len(data) < limit:
+        data += [
+            {
                 'label': b.title,
                 'label': b.title,
-                'category': _('book'),
+                'author': b.author_unicode(),
                 'id': b.id,
                 'url': b.get_absolute_url()
                 'id': b.id,
                 'url': b.get_absolute_url()
-                })
-
+            }
+            for b in Book.objects.filter(title__iregex='\m' + prefix)[:limit-len(data)]
+        ]
     callback = request.GET.get('callback', None)
     if callback:
         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
     callback = request.GET.get('callback', None)
     if callback:
         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
@@ -132,6 +105,10 @@ def hint(request):
 @cache.never_cache
 def main(request):
     query = request.GET.get('q', '')
 @cache.never_cache
 def main(request):
     query = request.GET.get('q', '')
+    query = ' '.join(query.split())
+    # filter out private use characters
+    import unicodedata
+    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
 
     if len(query) < 2:
         return render_to_response(
 
     if len(query) < 2:
         return render_to_response(
@@ -142,67 +119,42 @@ def main(request):
             'catalogue/search_too_long.html', {'prefix': query}, context_instance=RequestContext(request))
 
     query = remove_query_syntax_chars(query)
             'catalogue/search_too_long.html', {'prefix': query}, context_instance=RequestContext(request))
 
     query = remove_query_syntax_chars(query)
-    
-    search = Search()
 
 
-    theme_terms = search.index.analyze(text=query, field="themes_pl") \
-        + search.index.analyze(text=query, field="themes")
+    words = query.split()
+    if len(words) > 10:
+        query = ' '.join(words[:10])
+
+    search = Search()
 
 
-    # change hints
     tags = search.hint_tags(query, pdcounter=True, prefix=False)
     tags = split_tags(tags)
 
     tags = search.hint_tags(query, pdcounter=True, prefix=False)
     tags = split_tags(tags)
 
-    author_results = search.search_phrase(query, 'authors', book=True)
-    translator_results = search.search_phrase(query, 'translators', book=True)
-
-    title_results = search.search_phrase(query, 'title', book=True)
-
-    # Boost main author/title results with mixed search, and save some of its results for end of list.
-    # boost author, title results
-    author_title_mixed = search.search_some(query, ['authors', 'translators', 'title', 'tags'], query_terms=theme_terms)
-    author_title_rest = []
-
-    for b in author_title_mixed:
-        also_in_mixed = filter(lambda ba: ba.book_id == b.book_id, author_results + translator_results + title_results)
-        for b2 in also_in_mixed:
-            b2.boost *= 1.1
-        if also_in_mixed is []:
-            author_title_rest.append(b)
-
-    # Do a phrase search but a term search as well - this can give us better snippets then search_everywhere,
-    # Because the query is using only one field.
-    text_phrase = SearchResult.aggregate(
-        search.search_phrase(query, 'text', snippets=True, book=False),
-        search.search_some(query, ['text'], snippets=True, book=False, query_terms=theme_terms))
-
-    everywhere = search.search_everywhere(query, query_terms=theme_terms)
-
-    def already_found(results):
-        def f(e):
-            for r in results:
-                if e.book_id == r.book_id:
-                    e.boost = 0.9
-                    results.append(e)
-                    return True
-            return False
-        return f
-    f = already_found(author_results + translator_results + title_results + text_phrase)
-    everywhere = filter(lambda x: not f(x), everywhere)
-
-    author_results = SearchResult.aggregate(author_results)
-    translator_results = SearchResult.aggregate(translator_results)
-    title_results = SearchResult.aggregate(title_results)
-
-    everywhere = SearchResult.aggregate(everywhere, author_title_rest)
-
-    for field, res in [('authors', author_results),
-                       ('translators', translator_results),
-                       ('title', title_results),
-                       ('text', text_phrase),
-                       ('text', everywhere)]:
-        res.sort(reverse=True)
-        for r in res:
-            search.get_snippets(r, query, field, 3)
+    results_parts = []
+
+    search_fields = []
+    fieldsets = (
+        (['authors'], True),
+        (['title'], True),
+        (['metadata'], True),
+        (['text', 'themes_pl'], False),
+    )
+    for fieldset, is_book in fieldsets:
+        search_fields += fieldset
+        results_parts.append(search.search_words(words, search_fields, book=is_book))
+
+    results = []
+    ids_results = {}
+    for results_part in results_parts:
+        for result in sorted(SearchResult.aggregate(results_part), reverse=True):
+            book_id = result.book_id
+            if book_id in ids_results:
+                ids_results[book_id].merge(result)
+            else:
+                results.append(result)
+                ids_results[book_id] = result
+
+    for result in results:
+        search.get_snippets(result, query, num=3)
 
     suggestion = u''
 
 
     suggestion = u''
 
@@ -212,26 +164,9 @@ def main(request):
         except Book.DoesNotExist:
             return False
 
         except Book.DoesNotExist:
             return False
 
-    author_results = filter(ensure_exists, author_results)
-    translator_results = filter(ensure_exists, translator_results)
-    title_results = filter(ensure_exists, title_results)
-    text_phrase = filter(ensure_exists, text_phrase)
-    everywhere = filter(ensure_exists, everywhere)
-
-    results = author_results + translator_results + title_results + text_phrase + everywhere
-    # ensure books do exists & sort them
-    for res in (author_results, translator_results, title_results, text_phrase, everywhere):
-        res.sort(reverse=True)
-
-    # We don't want to redirect to book text, but rather display result page even with one result.
-    # if len(results) == 1:
-    #     fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
-    #     if len(fragment_hits) == 1:
-    #         #anchor = fragment_hits[0]['fragment']
-    #         #frag = Fragment.objects.get(anchor=anchor)
-    #         return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
-    #     return HttpResponseRedirect(results[0].book.get_absolute_url())
-    if len(results) == 0:
+    results = filter(ensure_exists, results)
+
+    if not results:
         form = PublishingSuggestForm(initial={"books": query + ", "})
         return render_to_response(
             'catalogue/search_no_hits.html',
         form = PublishingSuggestForm(initial={"books": query + ", "})
         return render_to_response(
             'catalogue/search_no_hits.html',
@@ -246,15 +181,9 @@ def main(request):
     return render_to_response(
         'catalogue/search_multiple_hits.html',
         {
     return render_to_response(
         'catalogue/search_multiple_hits.html',
         {
-            'tags': tags,
+            'tags': tags['author'] + tags['kind'] + tags['genre'] + tags['epoch'] + tags['theme'],
             'prefix': query,
             'prefix': query,
-            'results': {
-                'author': author_results,
-                'translator': translator_results,
-                'title': title_results,
-                'content': text_phrase,
-                'other': everywhere
-            },
+            'results': results,
             'did_you_mean': suggestion
         },
         context_instance=RequestContext(request))
             'did_you_mean': suggestion
         },
         context_instance=RequestContext(request))