src/search/views.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 from django.conf import settings
   5 from django.http.response import HttpResponseRedirect
   6 from django.shortcuts import render
   7 from django.views.decorators import cache
   8 from django.http import HttpResponse, JsonResponse
   9
  10 from catalogue.models import Book, Tag
  11 from pdcounter.models import Author
  12 from picture.models import Picture
  13 from search.index import Search, SearchResult, PictureResult
  14 from suggest.forms import PublishingSuggestForm
  15 import re
  16 import json
  17
  18 from wolnelektury.utils import re_escape
  19
  20
  21 def match_word_re(word):
  22     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
  23         return r"\b%s\b" % word
  24     elif 'mysql' in settings.DATABASES['default']['ENGINE']:
  25         return "[[:<:]]%s[[:>:]]" % word
  26
  27
  28 query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
  29
  30
  31 def remove_query_syntax_chars(query, replace=' '):
  32     return query_syntax_chars.sub(replace, query)
  33
  34
  35 def did_you_mean(query, tokens):
  36     return query
  37     # change = {}
  38     # for t in tokens:
  39     #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
  40     #     if len(authors) > 0:
  41     #         continue
  42
  43     #     if False:
  44     #         if not dictionary.check(t):
  45     #             try:
  46     #                 change_to = dictionary.suggest(t)[0].lower()
  47     #                 if change_to != t.lower():
  48     #                     change[t] = change_to
  49     #             except IndexError:
  50     #                 pass
  51
  52     # if change == {}:
  53     #     return None
  54
  55     # for frm, to in change.items():
  56     #     query = query.replace(frm, to)
  57
  58     # return query
  59
  60
  61 @cache.never_cache
  62 def hint(request, mozhint=False, param='term'):
  63     prefix = request.GET.get(param, '')
  64     if len(prefix) < 2:
  65         return JsonResponse([], safe=False)
  66
  67     prefix = re_escape(' '.join(remove_query_syntax_chars(prefix).split()))
  68
  69     try:
  70         limit = int(request.GET.get('max', ''))
  71     except ValueError:
  72         limit = 20
  73     else:
  74         if limit < 1:
  75             limit = 20
  76
  77     authors = Tag.objects.filter(
  78         category='author', name_pl__iregex='\m' + prefix).only('name', 'id', 'slug', 'category')
  79     data = [
  80         {
  81             'label': author.name,
  82             'id': author.id,
  83             'url': author.get_absolute_url(),
  84         }
  85         for author in authors[:limit]
  86     ]
  87     if len(data) < limit:
  88         data += [
  89             {
  90                 'label': b.title,
  91                 'author': b.author_unicode(),
  92                 'id': b.id,
  93                 'url': b.get_absolute_url()
  94             }
  95             for b in Book.objects.filter(findable=True, title__iregex='\m' + prefix)[:limit-len(data)]
  96         ]
  97
  98     if mozhint:
  99         data = [
 100             prefix,
 101             [
 102                 item['label']
 103                 for item in data
 104             ]
 105         ]
 106
 107     callback = request.GET.get('callback', None)
 108     if callback:
 109         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
 110                             content_type="application/json; charset=utf-8")
 111     else:
 112         return JsonResponse(data, safe=False)
 113
 114
 115 @cache.never_cache
 116 def main(request):
 117     query = request.GET.get('q', '')
 118     if len(query) < 2:
 119         return render(
 120             request, 'catalogue/search_too_short.html',
 121             {'prefix': query})
 122     elif len(query) > 256:
 123         return render(
 124             request, 'catalogue/search_too_long.html',
 125             {'prefix': query})
 126
 127     query = prepare_query(query)
 128     pd_authors = search_pd_authors(query)
 129     books = search_books(query)
 130     pictures = search_pictures(query)
 131     suggestion = ''
 132
 133     if not (books or pictures or pd_authors):
 134         form = PublishingSuggestForm(initial={"books": query + ", "})
 135         return render(
 136             request,
 137             'catalogue/search_no_hits.html',
 138             {
 139                 'form': form,
 140                 'did_you_mean': suggestion
 141             })
 142
 143     if not (books or pictures) and len(pd_authors) == 1:
 144         return HttpResponseRedirect(pd_authors[0].get_absolute_url())
 145
 146     return render(
 147         request,
 148         'catalogue/search_multiple_hits.html',
 149         {
 150             'pd_authors': pd_authors,
 151             'books': books,
 152             'pictures': pictures,
 153             'did_you_mean': suggestion
 154         })
 155
 156 def search_books(query):
 157     search = Search()
 158     results_parts = []
 159     search_fields = []
 160     words = query.split()
 161     fieldsets = (
 162         (['authors', 'authors_nonstem'], True),
 163         (['title', 'title_nonstem'], True),
 164         (['metadata', 'metadata_nonstem'], True),
 165         (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
 166     )
 167     for fields, is_book in fieldsets:
 168         search_fields += fields
 169         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))
 170     results = []
 171     ids_results = {}
 172     for results_part in results_parts:
 173         for result in sorted(SearchResult.aggregate(results_part), reverse=True):
 174             book_id = result.book_id
 175             if book_id in ids_results:
 176                 ids_results[book_id].merge(result)
 177             else:
 178                 results.append(result)
 179                 ids_results[book_id] = result
 180     descendant_ids = set(
 181         Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
 182     results = [result for result in results if result.book_id not in descendant_ids]
 183     for result in results:
 184         search.get_snippets(result, query, num=3)
 185
 186     def ensure_exists(r):
 187         try:
 188             return r.book
 189         except Book.DoesNotExist:
 190             return False
 191
 192     results = [r for r in results if ensure_exists(r)]
 193     return results
 194
 195
 196 def search_pictures(query):
 197     search = Search()
 198     results_parts = []
 199     search_fields = []
 200     words = query.split()
 201     fieldsets = (
 202         (['authors', 'authors_nonstem'], True),
 203         (['title', 'title_nonstem'], True),
 204         (['metadata', 'metadata_nonstem'], True),
 205         (['themes_pl', 'themes_pl_nonstem'], False),
 206     )
 207     for fields, is_book in fieldsets:
 208         search_fields += fields
 209         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))
 210     results = []
 211     ids_results = {}
 212     for results_part in results_parts:
 213         for result in sorted(PictureResult.aggregate(results_part), reverse=True):
 214             picture_id = result.picture_id
 215             if picture_id in ids_results:
 216                 ids_results[picture_id].merge(result)
 217             else:
 218                 results.append(result)
 219                 ids_results[picture_id] = result
 220
 221     def ensure_exists(r):
 222         try:
 223             return r.picture
 224         except Picture.DoesNotExist:
 225             return False
 226
 227     results = [r for r in results if ensure_exists(r)]
 228     return results
 229
 230
 231 def search_pd_authors(query):
 232     pd_authors = Author.objects.filter(name__icontains=query)
 233     existing_slugs = Tag.objects.filter(
 234         category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \
 235         .values_list('slug', flat=True)
 236     pd_authors = pd_authors.exclude(slug__in=existing_slugs)
 237     return pd_authors
 238
 239
 240 def prepare_query(query):
 241     query = ' '.join(query.split())
 242     # filter out private use characters
 243     import unicodedata
 244     query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
 245     query = remove_query_syntax_chars(query)
 246
 247     words = query.split()
 248     if len(words) > 10:
 249         query = ' '.join(words[:10])
 250     return query