src/search/views.py

   1 # -*- coding: utf-8 -*-
   2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   4 #
   5 from django.conf import settings
   6 from django.http.response import HttpResponseRedirect
   7 from django.shortcuts import render_to_response
   8 from django.template import RequestContext
   9 from django.views.decorators import cache
  10 from django.http import HttpResponse, JsonResponse
  11
  12 from catalogue.models import Book, Tag
  13 from pdcounter.models import Author
  14 from picture.models import Picture
  15 from search.index import Search, SearchResult, PictureResult
  16 from suggest.forms import PublishingSuggestForm
  17 import re
  18 import json
  19
  20 from wolnelektury.utils import re_escape
  21
  22
  23 def match_word_re(word):
  24     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
  25         return r"\b%s\b" % word
  26     elif 'mysql' in settings.DATABASES['default']['ENGINE']:
  27         return "[[:<:]]%s[[:>:]]" % word
  28
  29
  30 query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
  31
  32
  33 def remove_query_syntax_chars(query, replace=' '):
  34     return query_syntax_chars.sub(replace, query)
  35
  36
  37 def did_you_mean(query, tokens):
  38     return query
  39     # change = {}
  40     # for t in tokens:
  41     #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
  42     #     if len(authors) > 0:
  43     #         continue
  44
  45     #     if False:
  46     #         if not dictionary.check(t):
  47     #             try:
  48     #                 change_to = dictionary.suggest(t)[0].lower()
  49     #                 if change_to != t.lower():
  50     #                     change[t] = change_to
  51     #             except IndexError:
  52     #                 pass
  53
  54     # if change == {}:
  55     #     return None
  56
  57     # for frm, to in change.items():
  58     #     query = query.replace(frm, to)
  59
  60     # return query
  61
  62
  63 @cache.never_cache
  64 def hint(request):
  65     prefix = request.GET.get('term', '')
  66     if len(prefix) < 2:
  67         return JsonResponse([], safe=False)
  68
  69     prefix = re_escape(' '.join(remove_query_syntax_chars(prefix).split()))
  70
  71     try:
  72         limit = int(request.GET.get('max', ''))
  73     except ValueError:
  74         limit = 20
  75     else:
  76         if limit < 1:
  77             limit = 20
  78
  79     authors = Tag.objects.filter(
  80         category='author', name_pl__iregex='\m' + prefix).only('name', 'id', 'slug', 'category')
  81     data = [
  82         {
  83             'label': author.name,
  84             'id': author.id,
  85             'url': author.get_absolute_url(),
  86         }
  87         for author in authors[:limit]
  88     ]
  89     if len(data) < limit:
  90         data += [
  91             {
  92                 'label': b.title,
  93                 'author': b.author_unicode(),
  94                 'id': b.id,
  95                 'url': b.get_absolute_url()
  96             }
  97             for b in Book.objects.filter(title__iregex='\m' + prefix)[:limit-len(data)]
  98         ]
  99     callback = request.GET.get('callback', None)
 100     if callback:
 101         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
 102                             content_type="application/json; charset=utf-8")
 103     else:
 104         return JsonResponse(data, safe=False)
 105
 106
 107 @cache.never_cache
 108 def main(request):
 109     query = request.GET.get('q', '')
 110     if len(query) < 2:
 111         return render_to_response(
 112             'catalogue/search_too_short.html', {'prefix': query},
 113             context_instance=RequestContext(request))
 114     elif len(query) > 256:
 115         return render_to_response(
 116             'catalogue/search_too_long.html', {'prefix': query}, context_instance=RequestContext(request))
 117
 118     query = prepare_query(query)
 119     pd_authors = search_pd_authors(query)
 120     books = search_books(query)
 121     pictures = search_pictures(query)
 122     suggestion = u''
 123
 124     if not (books or pictures or pd_authors):
 125         form = PublishingSuggestForm(initial={"books": query + ", "})
 126         return render_to_response(
 127             'catalogue/search_no_hits.html',
 128             {
 129                 'form': form,
 130                 'did_you_mean': suggestion
 131             },
 132             context_instance=RequestContext(request))
 133
 134     if not (books or pictures) and len(pd_authors) == 1:
 135         return HttpResponseRedirect(pd_authors[0].get_absolute_url())
 136
 137     return render_to_response(
 138         'catalogue/search_multiple_hits.html',
 139         {
 140             'pd_authors': pd_authors,
 141             'books': books,
 142             'pictures': pictures,
 143             'did_you_mean': suggestion
 144         },
 145         context_instance=RequestContext(request))
 146
 147
 148 def search_books(query):
 149     search = Search()
 150     results_parts = []
 151     search_fields = []
 152     words = query.split()
 153     fieldsets = (
 154         (['authors', 'authors_nonstem'], True),
 155         (['title', 'title_nonstem'], True),
 156         (['metadata', 'metadata_nonstem'], True),
 157         (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
 158     )
 159     for fields, is_book in fieldsets:
 160         search_fields += fields
 161         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))
 162     results = []
 163     ids_results = {}
 164     for results_part in results_parts:
 165         for result in sorted(SearchResult.aggregate(results_part), reverse=True):
 166             book_id = result.book_id
 167             if book_id in ids_results:
 168                 ids_results[book_id].merge(result)
 169             else:
 170                 results.append(result)
 171                 ids_results[book_id] = result
 172     descendant_ids = set(
 173         Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
 174     results = [result for result in results if result.book_id not in descendant_ids]
 175     for result in results:
 176         search.get_snippets(result, query, num=3)
 177
 178     def ensure_exists(r):
 179         try:
 180             return r.book
 181         except Book.DoesNotExist:
 182             return False
 183
 184     results = filter(ensure_exists, results)
 185     return results
 186
 187
 188 def search_pictures(query):
 189     search = Search()
 190     results_parts = []
 191     search_fields = []
 192     words = query.split()
 193     fieldsets = (
 194         (['authors', 'authors_nonstem'], True),
 195         (['title', 'title_nonstem'], True),
 196         (['metadata', 'metadata_nonstem'], True),
 197         (['themes_pl', 'themes_pl_nonstem'], False),
 198     )
 199     for fields, is_book in fieldsets:
 200         search_fields += fields
 201         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))
 202     results = []
 203     ids_results = {}
 204     for results_part in results_parts:
 205         for result in sorted(PictureResult.aggregate(results_part), reverse=True):
 206             picture_id = result.picture_id
 207             if picture_id in ids_results:
 208                 ids_results[picture_id].merge(result)
 209             else:
 210                 results.append(result)
 211                 ids_results[picture_id] = result
 212
 213     def ensure_exists(r):
 214         try:
 215             return r.picture
 216         except Picture.DoesNotExist:
 217             return False
 218
 219     results = filter(ensure_exists, results)
 220     return results
 221
 222
 223 def search_pd_authors(query):
 224     pd_authors = Author.objects.filter(name__icontains=query)
 225     existing_slugs = Tag.objects.filter(
 226         category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \
 227         .values_list('slug', flat=True)
 228     pd_authors = pd_authors.exclude(slug__in=existing_slugs)
 229     return pd_authors
 230
 231
 232 def prepare_query(query):
 233     query = ' '.join(query.split())
 234     # filter out private use characters
 235     import unicodedata
 236     query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
 237     query = remove_query_syntax_chars(query)
 238
 239     words = query.split()
 240     if len(words) > 10:
 241         query = ' '.join(words[:10])
 242     return query