src/search/views.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 from django.conf import settings
   5 from django.http.response import HttpResponseRedirect
   6 from django.shortcuts import render
   7 from django.views.decorators import cache
   8 from django.http import HttpResponse, JsonResponse
   9
  10 from catalogue.models import Book, Tag
  11 from pdcounter.models import Author
  12 from picture.models import Picture
  13 from search.index import Search, SearchResult, PictureResult
  14 from suggest.forms import PublishingSuggestForm
  15 import re
  16 import json
  17
  18 from wolnelektury.utils import re_escape
  19
  20
  21 def match_word_re(word):
  22     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
  23         return r"\b%s\b" % word
  24     elif 'mysql' in settings.DATABASES['default']['ENGINE']:
  25         return "[[:<:]]%s[[:>:]]" % word
  26
  27
  28 query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
  29
  30
  31 def remove_query_syntax_chars(query, replace=' '):
  32     return query_syntax_chars.sub(replace, query)
  33
  34
  35 def did_you_mean(query, tokens):
  36     return query
  37     # change = {}
  38     # for t in tokens:
  39     #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
  40     #     if len(authors) > 0:
  41     #         continue
  42
  43     #     if False:
  44     #         if not dictionary.check(t):
  45     #             try:
  46     #                 change_to = dictionary.suggest(t)[0].lower()
  47     #                 if change_to != t.lower():
  48     #                     change[t] = change_to
  49     #             except IndexError:
  50     #                 pass
  51
  52     # if change == {}:
  53     #     return None
  54
  55     # for frm, to in change.items():
  56     #     query = query.replace(frm, to)
  57
  58     # return query
  59
  60
  61 @cache.never_cache
  62 def hint(request):
  63     prefix = request.GET.get('term', '')
  64     if len(prefix) < 2:
  65         return JsonResponse([], safe=False)
  66
  67     prefix = re_escape(' '.join(remove_query_syntax_chars(prefix).split()))
  68
  69     try:
  70         limit = int(request.GET.get('max', ''))
  71     except ValueError:
  72         limit = 20
  73     else:
  74         if limit < 1:
  75             limit = 20
  76
  77     authors = Tag.objects.filter(
  78         category='author', name_pl__iregex='\m' + prefix).only('name', 'id', 'slug', 'category')
  79     data = [
  80         {
  81             'label': author.name,
  82             'id': author.id,
  83             'url': author.get_absolute_url(),
  84         }
  85         for author in authors[:limit]
  86     ]
  87     if len(data) < limit:
  88         data += [
  89             {
  90                 'label': b.title,
  91                 'author': b.author_unicode(),
  92                 'id': b.id,
  93                 'url': b.get_absolute_url()
  94             }
  95             for b in Book.objects.filter(findable=True, title__iregex='\m' + prefix)[:limit-len(data)]
  96         ]
  97     callback = request.GET.get('callback', None)
  98     if callback:
  99         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
 100                             content_type="application/json; charset=utf-8")
 101     else:
 102         return JsonResponse(data, safe=False)
 103
 104
 105 @cache.never_cache
 106 def main(request):
 107     query = request.GET.get('q', '')
 108     if len(query) < 2:
 109         return render(
 110             request, 'catalogue/search_too_short.html',
 111             {'prefix': query})
 112     elif len(query) > 256:
 113         return render(
 114             request, 'catalogue/search_too_long.html',
 115             {'prefix': query})
 116
 117     query = prepare_query(query)
 118     pd_authors = search_pd_authors(query)
 119     books = search_books(query)
 120     pictures = search_pictures(query)
 121     suggestion = ''
 122
 123     if not (books or pictures or pd_authors):
 124         form = PublishingSuggestForm(initial={"books": query + ", "})
 125         return render(
 126             request,
 127             'catalogue/search_no_hits.html',
 128             {
 129                 'form': form,
 130                 'did_you_mean': suggestion
 131             })
 132
 133     if not (books or pictures) and len(pd_authors) == 1:
 134         return HttpResponseRedirect(pd_authors[0].get_absolute_url())
 135
 136     return render(
 137         request,
 138         'catalogue/search_multiple_hits.html',
 139         {
 140             'pd_authors': pd_authors,
 141             'books': books,
 142             'pictures': pictures,
 143             'did_you_mean': suggestion
 144         })
 145
 146 def search_books(query):
 147     search = Search()
 148     results_parts = []
 149     search_fields = []
 150     words = query.split()
 151     fieldsets = (
 152         (['authors', 'authors_nonstem'], True),
 153         (['title', 'title_nonstem'], True),
 154         (['metadata', 'metadata_nonstem'], True),
 155         (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
 156     )
 157     for fields, is_book in fieldsets:
 158         search_fields += fields
 159         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))
 160     results = []
 161     ids_results = {}
 162     for results_part in results_parts:
 163         for result in sorted(SearchResult.aggregate(results_part), reverse=True):
 164             book_id = result.book_id
 165             if book_id in ids_results:
 166                 ids_results[book_id].merge(result)
 167             else:
 168                 results.append(result)
 169                 ids_results[book_id] = result
 170     descendant_ids = set(
 171         Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
 172     results = [result for result in results if result.book_id not in descendant_ids]
 173     for result in results:
 174         search.get_snippets(result, query, num=3)
 175
 176     def ensure_exists(r):
 177         try:
 178             return r.book
 179         except Book.DoesNotExist:
 180             return False
 181
 182     results = [r for r in results if ensure_exists(r)]
 183     return results
 184
 185
 186 def search_pictures(query):
 187     search = Search()
 188     results_parts = []
 189     search_fields = []
 190     words = query.split()
 191     fieldsets = (
 192         (['authors', 'authors_nonstem'], True),
 193         (['title', 'title_nonstem'], True),
 194         (['metadata', 'metadata_nonstem'], True),
 195         (['themes_pl', 'themes_pl_nonstem'], False),
 196     )
 197     for fields, is_book in fieldsets:
 198         search_fields += fields
 199         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))
 200     results = []
 201     ids_results = {}
 202     for results_part in results_parts:
 203         for result in sorted(PictureResult.aggregate(results_part), reverse=True):
 204             picture_id = result.picture_id
 205             if picture_id in ids_results:
 206                 ids_results[picture_id].merge(result)
 207             else:
 208                 results.append(result)
 209                 ids_results[picture_id] = result
 210
 211     def ensure_exists(r):
 212         try:
 213             return r.picture
 214         except Picture.DoesNotExist:
 215             return False
 216
 217     results = [r for r in results if ensure_exists(r)]
 218     return results
 219
 220
 221 def search_pd_authors(query):
 222     pd_authors = Author.objects.filter(name__icontains=query)
 223     existing_slugs = Tag.objects.filter(
 224         category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \
 225         .values_list('slug', flat=True)
 226     pd_authors = pd_authors.exclude(slug__in=existing_slugs)
 227     return pd_authors
 228
 229
 230 def prepare_query(query):
 231     query = ' '.join(query.split())
 232     # filter out private use characters
 233     import unicodedata
 234     query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
 235     query = remove_query_syntax_chars(query)
 236
 237     words = query.split()
 238     if len(words) > 10:
 239         query = ' '.join(words[:10])
 240     return query