src/search/views.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 from django.conf import settings
   5 from django.http.response import HttpResponseRedirect
   6 from django.shortcuts import render
   7 from django.views.decorators import cache
   8 from django.http import HttpResponse, JsonResponse
   9
  10 from catalogue.models import Book, Tag
  11 from pdcounter.models import Author
  12 from picture.models import Picture
  13 from search.index import Search, SearchResult, PictureResult
  14 from .forms import SearchFilters
  15 from suggest.forms import PublishingSuggestForm
  16 import re
  17 import json
  18
  19 from wolnelektury.utils import re_escape
  20
  21
  22 def match_word_re(word):
  23     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
  24         return r"\b%s\b" % word
  25     elif 'mysql' in settings.DATABASES['default']['ENGINE']:
  26         return "[[:<:]]%s[[:>:]]" % word
  27
  28
  29 query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
  30
  31
  32 def remove_query_syntax_chars(query, replace=' '):
  33     return query_syntax_chars.sub(replace, query)
  34
  35
  36 def did_you_mean(query, tokens):
  37     return query
  38     # change = {}
  39     # for t in tokens:
  40     #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
  41     #     if len(authors) > 0:
  42     #         continue
  43
  44     #     if False:
  45     #         if not dictionary.check(t):
  46     #             try:
  47     #                 change_to = dictionary.suggest(t)[0].lower()
  48     #                 if change_to != t.lower():
  49     #                     change[t] = change_to
  50     #             except IndexError:
  51     #                 pass
  52
  53     # if change == {}:
  54     #     return None
  55
  56     # for frm, to in change.items():
  57     #     query = query.replace(frm, to)
  58
  59     # return query
  60
  61
  62 @cache.never_cache
  63 def hint(request, mozhint=False, param='term'):
  64     prefix = request.GET.get(param, '')
  65     if len(prefix) < 2:
  66         return JsonResponse([], safe=False)
  67
  68     prefix = re_escape(' '.join(remove_query_syntax_chars(prefix).split()))
  69
  70     try:
  71         limit = int(request.GET.get('max', ''))
  72     except ValueError:
  73         limit = 20
  74     else:
  75         if limit < 1:
  76             limit = 20
  77
  78     authors = Tag.objects.filter(
  79         category='author', name_pl__iregex='\m' + prefix).only('name', 'id', 'slug', 'category')
  80     data = [
  81         {
  82             'label': author.name,
  83             'id': author.id,
  84             'url': author.get_absolute_url(),
  85         }
  86         for author in authors[:limit]
  87     ]
  88     if len(data) < limit:
  89         for b in Book.objects.filter(findable=True, title__iregex='\m' + prefix)[:limit-len(data)]:
  90             author_str = b.author_unicode()
  91             translator = b.translator()
  92             if translator:
  93                 author_str += ' (tłum. ' + translator + ')'
  94             data.append(
  95                 {
  96                     'label': b.title,
  97                     'author': author_str,
  98                     'id': b.id,
  99                     'url': b.get_absolute_url()
 100                 }
 101             )
 102
 103     if mozhint:
 104         data = [
 105             prefix,
 106             [
 107                 item['label']
 108                 for item in data
 109             ]
 110         ]
 111
 112     callback = request.GET.get('callback', None)
 113     if callback:
 114         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
 115                             content_type="application/json; charset=utf-8")
 116     else:
 117         return JsonResponse(data, safe=False)
 118
 119
 120
 121 @cache.never_cache
 122 def search(request):
 123     filters = SearchFilters(request.GET)
 124     ctx = {
 125         'title': 'Wynik wyszukiwania',
 126         'query': request.GET.get('q', ''),
 127         'filters': filters,
 128     }
 129     if filters.is_valid():
 130         ctx['results'] = filters.results()
 131         for k, v in ctx['results'].items():
 132             if v:
 133                 ctx['hasresults'] = True
 134                 break
 135     return render(request, 'search/results.html', ctx)
 136
 137
 138 @cache.never_cache
 139 def main(request):
 140     if request.EXPERIMENTS['layout'].value:
 141         return search(request)
 142
 143     query = request.GET.get('q', '')
 144
 145     format = request.GET.get('format')
 146     lang = request.GET.get('lang')
 147     epoch = request.GET.get('epoch')
 148     kind = request.GET.get('kind')
 149     genre = request.GET.get('genre')
 150
 151     if len(query) < 2:
 152         return render(
 153             request, 'catalogue/search_too_short.html',
 154             {'prefix': query})
 155     elif len(query) > 256:
 156         return render(
 157             request, 'catalogue/search_too_long.html',
 158             {'prefix': query})
 159
 160     query = prepare_query(query)
 161     if not (format or lang or epoch or kind or genre):
 162         pd_authors = search_pd_authors(query)
 163     else:
 164         pd_authors = []
 165     if not format or format != 'obraz':
 166         books = search_books(
 167             query,
 168             lang=lang,
 169             only_audio=format=='audio',
 170             only_synchro=format=='synchro',
 171             epoch=epoch,
 172             kind=kind,
 173             genre=genre
 174         )
 175     else:
 176         books = []
 177     if (not format or format == 'obraz') and not lang:
 178         pictures = search_pictures(
 179             query,
 180             epoch=epoch,
 181             kind=kind,
 182             genre=genre
 183         )
 184     else:
 185         pictures = []
 186
 187     suggestion = ''
 188
 189     if not (books or pictures or pd_authors):
 190         form = PublishingSuggestForm(initial={"books": query + ", "})
 191         return render(
 192             request,
 193             'catalogue/search_no_hits.html',
 194             {
 195                 'form': form,
 196                 'did_you_mean': suggestion
 197             })
 198
 199     if not (books or pictures) and len(pd_authors) == 1:
 200         return HttpResponseRedirect(pd_authors[0].get_absolute_url())
 201
 202     return render(
 203         request,
 204         'catalogue/search_multiple_hits.html',
 205         {
 206             'pd_authors': pd_authors,
 207             'books': books,
 208             'pictures': pictures,
 209             'did_you_mean': suggestion,
 210             'set': {
 211                 'lang': lang,
 212                 'format': format,
 213                 'epoch': epoch,
 214                 'kind': kind,
 215                 'genre': genre,
 216             },
 217             'tags': {
 218                 'epoch': Tag.objects.filter(category='epoch', for_books=True),
 219                 'genre': Tag.objects.filter(category='genre', for_books=True),
 220                 'kind': Tag.objects.filter(category='kind', for_books=True),
 221             },
 222         })
 223
 224 def search_books(query, lang=None, only_audio=False, only_synchro=False, epoch=None, kind=None, genre=None):
 225     search = Search()
 226     results_parts = []
 227     search_fields = []
 228     words = query.split()
 229     fieldsets = (
 230         (['authors', 'authors_nonstem'], True),
 231         (['title', 'title_nonstem'], True),
 232         (['metadata', 'metadata_nonstem'], True),
 233         (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
 234     )
 235     for fields, is_book in fieldsets:
 236         search_fields += fields
 237         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))
 238     results = []
 239     ids_results = {}
 240     for results_part in results_parts:
 241         for result in sorted(SearchResult.aggregate(results_part), reverse=True):
 242             book_id = result.book_id
 243             if book_id in ids_results:
 244                 ids_results[book_id].merge(result)
 245             else:
 246                 results.append(result)
 247                 ids_results[book_id] = result
 248     descendant_ids = set(
 249         Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
 250     results = [result for result in results if result.book_id not in descendant_ids]
 251     for result in results:
 252         search.get_snippets(result, query, num=3)
 253
 254     def ensure_exists(r):
 255         try:
 256             if not r.book:
 257                 return False
 258         except Book.DoesNotExist:
 259             return False
 260
 261         if lang and r.book.language != lang:
 262             return False
 263         if only_audio and not r.book.has_mp3_file():
 264             return False
 265         if only_synchro and not r.book.has_daisy_file():
 266             return False
 267         if epoch and not r.book.tags.filter(category='epoch', slug=epoch).exists():
 268             return False
 269         if kind and not r.book.tags.filter(category='kind', slug=kind).exists():
 270             return False
 271         if genre and not r.book.tags.filter(category='genre', slug=genre).exists():
 272             return False
 273
 274         return True
 275
 276     results = [r for r in results if ensure_exists(r)]
 277     return results
 278
 279
 280 def search_pictures(query, epoch=None, kind=None, genre=None):
 281     search = Search()
 282     results_parts = []
 283     search_fields = []
 284     words = query.split()
 285     fieldsets = (
 286         (['authors', 'authors_nonstem'], True),
 287         (['title', 'title_nonstem'], True),
 288         (['metadata', 'metadata_nonstem'], True),
 289         (['themes_pl', 'themes_pl_nonstem'], False),
 290     )
 291     for fields, is_book in fieldsets:
 292         search_fields += fields
 293         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))
 294     results = []
 295     ids_results = {}
 296     for results_part in results_parts:
 297         for result in sorted(PictureResult.aggregate(results_part), reverse=True):
 298             picture_id = result.picture_id
 299             if picture_id in ids_results:
 300                 ids_results[picture_id].merge(result)
 301             else:
 302                 results.append(result)
 303                 ids_results[picture_id] = result
 304
 305     def ensure_exists(r):
 306         try:
 307             if not r.picture:
 308                 return False
 309         except Picture.DoesNotExist:
 310             return False
 311
 312         if epoch and not r.picture.tags.filter(category='epoch', slug=epoch).exists():
 313             return False
 314         if kind and not r.picture.tags.filter(category='kind', slug=kind).exists():
 315             return False
 316         if genre and not r.picture.tags.filter(category='genre', slug=genre).exists():
 317             return False
 318
 319         return True
 320
 321     results = [r for r in results if ensure_exists(r)]
 322     return results
 323
 324
 325 def search_pd_authors(query):
 326     pd_authors = Author.objects.filter(name__icontains=query)
 327     existing_slugs = Tag.objects.filter(
 328         category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \
 329         .values_list('slug', flat=True)
 330     pd_authors = pd_authors.exclude(slug__in=existing_slugs)
 331     return pd_authors
 332
 333
 334 def prepare_query(query):
 335     query = ' '.join(query.split())
 336     # filter out private use characters
 337     import unicodedata
 338     query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
 339     query = remove_query_syntax_chars(query)
 340
 341     words = query.split()
 342     if len(words) > 10:
 343         query = ' '.join(words[:10])
 344     return query