src/search/views.py

   1 # -*- coding: utf-8 -*-
   2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   4 #
   5 from django.conf import settings
   6 from django.http.response import HttpResponseRedirect
   7 from django.shortcuts import render_to_response
   8 from django.template import RequestContext
   9 from django.views.decorators import cache
  10 from django.http import HttpResponse, JsonResponse
  11
  12 from catalogue.models import Book, Tag
  13 from pdcounter.models import Author
  14 from picture.models import Picture
  15 from search.index import Search, SearchResult, PictureResult
  16 from suggest.forms import PublishingSuggestForm
  17 import re
  18 import json
  19
  20 from wolnelektury.utils import re_escape
  21
  22
  23 def match_word_re(word):
  24     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
  25         return r"\b%s\b" % word
  26     elif 'mysql' in settings.DATABASES['default']['ENGINE']:
  27         return "[[:<:]]%s[[:>:]]" % word
  28
  29
  30 query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
  31
  32
  33 def remove_query_syntax_chars(query, replace=' '):
  34     return query_syntax_chars.sub(replace, query)
  35
  36
  37 def did_you_mean(query, tokens):
  38     return query
  39     # change = {}
  40     # for t in tokens:
  41     #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
  42     #     if len(authors) > 0:
  43     #         continue
  44
  45     #     if False:
  46     #         if not dictionary.check(t):
  47     #             try:
  48     #                 change_to = dictionary.suggest(t)[0].lower()
  49     #                 if change_to != t.lower():
  50     #                     change[t] = change_to
  51     #             except IndexError:
  52     #                 pass
  53
  54     # if change == {}:
  55     #     return None
  56
  57     # for frm, to in change.items():
  58     #     query = query.replace(frm, to)
  59
  60     # return query
  61
  62
  63 @cache.never_cache
  64 def hint(request):
  65     prefix = request.GET.get('term', '')
  66     if len(prefix) < 2:
  67         return JsonResponse([], safe=False)
  68
  69     prefix = re_escape(' '.join(remove_query_syntax_chars(prefix).split()))
  70
  71     try:
  72         limit = int(request.GET.get('max', ''))
  73     except ValueError:
  74         limit = 20
  75     else:
  76         if limit < 1:
  77             limit = 20
  78
  79     authors = Tag.objects.filter(
  80         category='author', name_pl__iregex='\m' + prefix).only('name', 'id', 'slug', 'category')
  81     data = [
  82         {
  83             'label': author.name,
  84             'id': author.id,
  85             'url': author.get_absolute_url(),
  86         }
  87         for author in authors[:limit]
  88     ]
  89     if len(data) < limit:
  90         data += [
  91             {
  92                 'label': b.title,
  93                 'author': b.author_unicode(),
  94                 'id': b.id,
  95                 'url': b.get_absolute_url()
  96             }
  97             for b in Book.objects.filter(title__iregex='\m' + prefix)[:limit-len(data)]
  98         ]
  99     callback = request.GET.get('callback', None)
 100     if callback:
 101         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
 102                             content_type="application/json; charset=utf-8")
 103     else:
 104         return JsonResponse(data, safe=False)
 105
 106
 107 @cache.never_cache
 108 def main(request):
 109     return HttpResponse('Search is temporarily disabled', status=503)
 110     query = request.GET.get('q', '')
 111     if len(query) < 2:
 112         return render_to_response(
 113             'catalogue/search_too_short.html', {'prefix': query},
 114             context_instance=RequestContext(request))
 115     elif len(query) > 256:
 116         return render_to_response(
 117             'catalogue/search_too_long.html', {'prefix': query}, context_instance=RequestContext(request))
 118
 119     query = prepare_query(query)
 120     pd_authors = search_pd_authors(query)
 121     books = search_books(query)
 122     pictures = search_pictures(query)
 123     suggestion = u''
 124
 125     if not (books or pictures or pd_authors):
 126         form = PublishingSuggestForm(initial={"books": query + ", "})
 127         return render_to_response(
 128             'catalogue/search_no_hits.html',
 129             {
 130                 'form': form,
 131                 'did_you_mean': suggestion
 132             },
 133             context_instance=RequestContext(request))
 134
 135     if not (books or pictures) and len(pd_authors) == 1:
 136         return HttpResponseRedirect(pd_authors[0].get_absolute_url())
 137
 138     return render_to_response(
 139         'catalogue/search_multiple_hits.html',
 140         {
 141             'pd_authors': pd_authors,
 142             'books': books,
 143             'pictures': pictures,
 144             'did_you_mean': suggestion
 145         },
 146         context_instance=RequestContext(request))
 147
 148
 149 def search_books(query):
 150     search = Search()
 151     results_parts = []
 152     search_fields = []
 153     words = query.split()
 154     fieldsets = (
 155         (['authors'], True),
 156         (['title'], True),
 157         (['metadata'], True),
 158         (['text', 'themes_pl'], False),
 159     )
 160     for fields, is_book in fieldsets:
 161         search_fields += fields
 162         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))
 163     results = []
 164     ids_results = {}
 165     for results_part in results_parts:
 166         for result in sorted(SearchResult.aggregate(results_part), reverse=True):
 167             book_id = result.book_id
 168             if book_id in ids_results:
 169                 ids_results[book_id].merge(result)
 170             else:
 171                 results.append(result)
 172                 ids_results[book_id] = result
 173     descendant_ids = set(
 174         Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
 175     results = [result for result in results if result.book_id not in descendant_ids]
 176     for result in results:
 177         search.get_snippets(result, query, num=3)
 178
 179     def ensure_exists(r):
 180         try:
 181             return r.book
 182         except Book.DoesNotExist:
 183             return False
 184
 185     results = filter(ensure_exists, results)
 186     return results
 187
 188
 189 def search_pictures(query):
 190     search = Search()
 191     results_parts = []
 192     search_fields = []
 193     words = query.split()
 194     fieldsets = (
 195         (['authors'], True),
 196         (['title'], True),
 197         (['metadata'], True),
 198         (['themes_pl'], False),
 199     )
 200     for fields, is_book in fieldsets:
 201         search_fields += fields
 202         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))
 203     results = []
 204     ids_results = {}
 205     for results_part in results_parts:
 206         for result in sorted(PictureResult.aggregate(results_part), reverse=True):
 207             picture_id = result.picture_id
 208             if picture_id in ids_results:
 209                 ids_results[picture_id].merge(result)
 210             else:
 211                 results.append(result)
 212                 ids_results[picture_id] = result
 213
 214     def ensure_exists(r):
 215         try:
 216             return r.picture
 217         except Picture.DoesNotExist:
 218             return False
 219
 220     results = filter(ensure_exists, results)
 221     return results
 222
 223
 224 def search_pd_authors(query):
 225     pd_authors = Author.objects.filter(name__icontains=query)
 226     existing_slugs = Tag.objects.filter(
 227         category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \
 228         .values_list('slug', flat=True)
 229     pd_authors = pd_authors.exclude(slug__in=existing_slugs)
 230     return pd_authors
 231
 232
 233 def prepare_query(query):
 234     query = ' '.join(query.split())
 235     # filter out private use characters
 236     import unicodedata
 237     query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
 238     query = remove_query_syntax_chars(query)
 239
 240     words = query.split()
 241     if len(words) > 10:
 242         query = ' '.join(words[:10])
 243     return query