src/search/views.py

   1 # -*- coding: utf-8 -*-
   2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   4 #
   5 from django.conf import settings
   6 from django.http.response import HttpResponseRedirect
   7 from django.shortcuts import render
   8 from django.views.decorators import cache
   9 from django.http import HttpResponse, JsonResponse
  10
  11 from catalogue.models import Book, Tag
  12 from pdcounter.models import Author
  13 from picture.models import Picture
  14 from search.index import Search, SearchResult, PictureResult
  15 from suggest.forms import PublishingSuggestForm
  16 import re
  17 import json
  18
  19 from wolnelektury.utils import re_escape
  20
  21
  22 def match_word_re(word):
  23     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
  24         return r"\b%s\b" % word
  25     elif 'mysql' in settings.DATABASES['default']['ENGINE']:
  26         return "[[:<:]]%s[[:>:]]" % word
  27
  28
  29 query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
  30
  31
  32 def remove_query_syntax_chars(query, replace=' '):
  33     return query_syntax_chars.sub(replace, query)
  34
  35
  36 def did_you_mean(query, tokens):
  37     return query
  38     # change = {}
  39     # for t in tokens:
  40     #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
  41     #     if len(authors) > 0:
  42     #         continue
  43
  44     #     if False:
  45     #         if not dictionary.check(t):
  46     #             try:
  47     #                 change_to = dictionary.suggest(t)[0].lower()
  48     #                 if change_to != t.lower():
  49     #                     change[t] = change_to
  50     #             except IndexError:
  51     #                 pass
  52
  53     # if change == {}:
  54     #     return None
  55
  56     # for frm, to in change.items():
  57     #     query = query.replace(frm, to)
  58
  59     # return query
  60
  61
  62 @cache.never_cache
  63 def hint(request):
  64     prefix = request.GET.get('term', '')
  65     if len(prefix) < 2:
  66         return JsonResponse([], safe=False)
  67
  68     prefix = re_escape(' '.join(remove_query_syntax_chars(prefix).split()))
  69
  70     try:
  71         limit = int(request.GET.get('max', ''))
  72     except ValueError:
  73         limit = 20
  74     else:
  75         if limit < 1:
  76             limit = 20
  77
  78     authors = Tag.objects.filter(
  79         category='author', name_pl__iregex='\m' + prefix).only('name', 'id', 'slug', 'category')
  80     data = [
  81         {
  82             'label': author.name,
  83             'id': author.id,
  84             'url': author.get_absolute_url(),
  85         }
  86         for author in authors[:limit]
  87     ]
  88     if len(data) < limit:
  89         data += [
  90             {
  91                 'label': b.title,
  92                 'author': b.author_unicode(),
  93                 'id': b.id,
  94                 'url': b.get_absolute_url()
  95             }
  96             for b in Book.objects.filter(title__iregex='\m' + prefix)[:limit-len(data)]
  97         ]
  98     callback = request.GET.get('callback', None)
  99     if callback:
 100         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
 101                             content_type="application/json; charset=utf-8")
 102     else:
 103         return JsonResponse(data, safe=False)
 104
 105
 106 @cache.never_cache
 107 def main(request):
 108     query = request.GET.get('q', '')
 109     if len(query) < 2:
 110         return render(
 111             request, 'catalogue/search_too_short.html',
 112             {'prefix': query})
 113     elif len(query) > 256:
 114         return render(
 115             request, 'catalogue/search_too_long.html',
 116             {'prefix': query})
 117
 118     query = prepare_query(query)
 119     pd_authors = search_pd_authors(query)
 120     books = search_books(query)
 121     pictures = search_pictures(query)
 122     suggestion = u''
 123
 124     if not (books or pictures or pd_authors):
 125         form = PublishingSuggestForm(initial={"books": query + ", "})
 126         return render(
 127             request,
 128             'catalogue/search_no_hits.html',
 129             {
 130                 'form': form,
 131                 'did_you_mean': suggestion
 132             })
 133
 134     if not (books or pictures) and len(pd_authors) == 1:
 135         return HttpResponseRedirect(pd_authors[0].get_absolute_url())
 136
 137     return render(
 138         request,
 139         'catalogue/search_multiple_hits.html',
 140         {
 141             'pd_authors': pd_authors,
 142             'books': books,
 143             'pictures': pictures,
 144             'did_you_mean': suggestion
 145         })
 146
 147 def search_books(query):
 148     search = Search()
 149     results_parts = []
 150     search_fields = []
 151     words = query.split()
 152     fieldsets = (
 153         (['authors', 'authors_nonstem'], True),
 154         (['title', 'title_nonstem'], True),
 155         (['metadata', 'metadata_nonstem'], True),
 156         (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
 157     )
 158     for fields, is_book in fieldsets:
 159         search_fields += fields
 160         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))
 161     results = []
 162     ids_results = {}
 163     for results_part in results_parts:
 164         for result in sorted(SearchResult.aggregate(results_part), reverse=True):
 165             book_id = result.book_id
 166             if book_id in ids_results:
 167                 ids_results[book_id].merge(result)
 168             else:
 169                 results.append(result)
 170                 ids_results[book_id] = result
 171     descendant_ids = set(
 172         Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
 173     results = [result for result in results if result.book_id not in descendant_ids]
 174     for result in results:
 175         search.get_snippets(result, query, num=3)
 176
 177     def ensure_exists(r):
 178         try:
 179             return r.book
 180         except Book.DoesNotExist:
 181             return False
 182
 183     results = filter(ensure_exists, results)
 184     return results
 185
 186
 187 def search_pictures(query):
 188     search = Search()
 189     results_parts = []
 190     search_fields = []
 191     words = query.split()
 192     fieldsets = (
 193         (['authors', 'authors_nonstem'], True),
 194         (['title', 'title_nonstem'], True),
 195         (['metadata', 'metadata_nonstem'], True),
 196         (['themes_pl', 'themes_pl_nonstem'], False),
 197     )
 198     for fields, is_book in fieldsets:
 199         search_fields += fields
 200         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))
 201     results = []
 202     ids_results = {}
 203     for results_part in results_parts:
 204         for result in sorted(PictureResult.aggregate(results_part), reverse=True):
 205             picture_id = result.picture_id
 206             if picture_id in ids_results:
 207                 ids_results[picture_id].merge(result)
 208             else:
 209                 results.append(result)
 210                 ids_results[picture_id] = result
 211
 212     def ensure_exists(r):
 213         try:
 214             return r.picture
 215         except Picture.DoesNotExist:
 216             return False
 217
 218     results = filter(ensure_exists, results)
 219     return results
 220
 221
 222 def search_pd_authors(query):
 223     pd_authors = Author.objects.filter(name__icontains=query)
 224     existing_slugs = Tag.objects.filter(
 225         category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \
 226         .values_list('slug', flat=True)
 227     pd_authors = pd_authors.exclude(slug__in=existing_slugs)
 228     return pd_authors
 229
 230
 231 def prepare_query(query):
 232     query = ' '.join(query.split())
 233     # filter out private use characters
 234     import unicodedata
 235     query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
 236     query = remove_query_syntax_chars(query)
 237
 238     words = query.split()
 239     if len(words) > 10:
 240         query = ' '.join(words[:10])
 241     return query