src/search/views.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 from django.conf import settings
   5 from django.http.response import HttpResponseRedirect
   6 from django.shortcuts import render
   7 from django.views.decorators import cache
   8 from django.http import HttpResponse, JsonResponse
   9
  10 from catalogue.models import Book, Tag
  11 from pdcounter.models import Author
  12 from picture.models import Picture
  13 from search.index import Search, SearchResult, PictureResult
  14 from .forms import SearchFilters
  15 from suggest.forms import PublishingSuggestForm
  16 import re
  17 import json
  18
  19 from wolnelektury.utils import re_escape
  20
  21
  22 def match_word_re(word):
  23     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
  24         return r"\b%s\b" % word
  25     elif 'mysql' in settings.DATABASES['default']['ENGINE']:
  26         return "[[:<:]]%s[[:>:]]" % word
  27
  28
  29 query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
  30
  31
  32 def remove_query_syntax_chars(query, replace=' '):
  33     return query_syntax_chars.sub(replace, query)
  34
  35
  36 def did_you_mean(query, tokens):
  37     return query
  38     # change = {}
  39     # for t in tokens:
  40     #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
  41     #     if len(authors) > 0:
  42     #         continue
  43
  44     #     if False:
  45     #         if not dictionary.check(t):
  46     #             try:
  47     #                 change_to = dictionary.suggest(t)[0].lower()
  48     #                 if change_to != t.lower():
  49     #                     change[t] = change_to
  50     #             except IndexError:
  51     #                 pass
  52
  53     # if change == {}:
  54     #     return None
  55
  56     # for frm, to in change.items():
  57     #     query = query.replace(frm, to)
  58
  59     # return query
  60
  61
  62 @cache.never_cache
  63 def hint(request, mozhint=False, param='term'):
  64     prefix = request.GET.get(param, '')
  65     if len(prefix) < 2:
  66         return JsonResponse([], safe=False)
  67
  68     prefix = re_escape(' '.join(remove_query_syntax_chars(prefix).split()))
  69
  70     try:
  71         limit = int(request.GET.get('max', ''))
  72     except ValueError:
  73         limit = 20
  74     else:
  75         if limit < 1:
  76             limit = 20
  77
  78     authors = Tag.objects.filter(
  79         category='author', name_pl__iregex='\m' + prefix).only('name', 'id', 'slug', 'category')
  80     data = [
  81         {
  82             'label': author.name,
  83             'id': author.id,
  84             'url': author.get_absolute_url(),
  85         }
  86         for author in authors[:limit]
  87     ]
  88     if len(data) < limit:
  89         for b in Book.objects.filter(findable=True, title__iregex='\m' + prefix)[:limit-len(data)]:
  90             author_str = b.author_unicode()
  91             translator = b.translator()
  92             if translator:
  93                 author_str += ' (tłum. ' + translator + ')'
  94             data.append(
  95                 {
  96                     'label': b.title,
  97                     'author': author_str,
  98                     'id': b.id,
  99                     'url': b.get_absolute_url()
 100                 }
 101             )
 102
 103     if mozhint:
 104         data = [
 105             prefix,
 106             [
 107                 item['label']
 108                 for item in data
 109             ]
 110         ]
 111
 112     callback = request.GET.get('callback', None)
 113     if callback:
 114         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
 115                             content_type="application/json; charset=utf-8")
 116     else:
 117         return JsonResponse(data, safe=False)
 118
 119
 120
 121 @cache.never_cache
 122 def search(request):
 123     filters = SearchFilters(request.GET)
 124     ctx = {
 125         'title': 'Wynik wyszukiwania',
 126         'query': filters.data['q'],
 127         'filters': filters,
 128     }
 129     if filters.is_valid():
 130         ctx['results'] = filters.results()
 131         for k, v in ctx['results'].items():
 132             if v:
 133                 ctx['hasresults'] = True
 134                 break
 135     return render(request, 'search/results.html', ctx)
 136
 137
 138 @cache.never_cache
 139 def main(request):
 140     if request.EXPERIMENTS['search'].value:
 141         request.EXPERIMENTS['layout'].override(True)
 142         return search(request)
 143
 144     query = request.GET.get('q', '')
 145
 146     format = request.GET.get('format')
 147     lang = request.GET.get('lang')
 148     epoch = request.GET.get('epoch')
 149     kind = request.GET.get('kind')
 150     genre = request.GET.get('genre')
 151
 152     if len(query) < 2:
 153         return render(
 154             request, 'catalogue/search_too_short.html',
 155             {'prefix': query})
 156     elif len(query) > 256:
 157         return render(
 158             request, 'catalogue/search_too_long.html',
 159             {'prefix': query})
 160
 161     query = prepare_query(query)
 162     if not (format or lang or epoch or kind or genre):
 163         pd_authors = search_pd_authors(query)
 164     else:
 165         pd_authors = []
 166     if not format or format != 'obraz':
 167         books = search_books(
 168             query,
 169             lang=lang,
 170             only_audio=format=='audio',
 171             only_synchro=format=='synchro',
 172             epoch=epoch,
 173             kind=kind,
 174             genre=genre
 175         )
 176     else:
 177         books = []
 178     if (not format or format == 'obraz') and not lang:
 179         pictures = search_pictures(
 180             query,
 181             epoch=epoch,
 182             kind=kind,
 183             genre=genre
 184         )
 185     else:
 186         pictures = []
 187
 188     suggestion = ''
 189
 190     if not (books or pictures or pd_authors):
 191         form = PublishingSuggestForm(initial={"books": query + ", "})
 192         return render(
 193             request,
 194             'catalogue/search_no_hits.html',
 195             {
 196                 'form': form,
 197                 'did_you_mean': suggestion
 198             })
 199
 200     if not (books or pictures) and len(pd_authors) == 1:
 201         return HttpResponseRedirect(pd_authors[0].get_absolute_url())
 202
 203     return render(
 204         request,
 205         'catalogue/search_multiple_hits.html',
 206         {
 207             'pd_authors': pd_authors,
 208             'books': books,
 209             'pictures': pictures,
 210             'did_you_mean': suggestion,
 211             'set': {
 212                 'lang': lang,
 213                 'format': format,
 214                 'epoch': epoch,
 215                 'kind': kind,
 216                 'genre': genre,
 217             },
 218             'tags': {
 219                 'epoch': Tag.objects.filter(category='epoch', for_books=True),
 220                 'genre': Tag.objects.filter(category='genre', for_books=True),
 221                 'kind': Tag.objects.filter(category='kind', for_books=True),
 222             },
 223         })
 224
 225 def search_books(query, lang=None, only_audio=False, only_synchro=False, epoch=None, kind=None, genre=None):
 226     search = Search()
 227     results_parts = []
 228     search_fields = []
 229     words = query.split()
 230     fieldsets = (
 231         (['authors', 'authors_nonstem'], True),
 232         (['title', 'title_nonstem'], True),
 233         (['metadata', 'metadata_nonstem'], True),
 234         (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
 235     )
 236     for fields, is_book in fieldsets:
 237         search_fields += fields
 238         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))
 239     results = []
 240     ids_results = {}
 241     for results_part in results_parts:
 242         for result in sorted(SearchResult.aggregate(results_part), reverse=True):
 243             book_id = result.book_id
 244             if book_id in ids_results:
 245                 ids_results[book_id].merge(result)
 246             else:
 247                 results.append(result)
 248                 ids_results[book_id] = result
 249     descendant_ids = set(
 250         Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
 251     results = [result for result in results if result.book_id not in descendant_ids]
 252     for result in results:
 253         search.get_snippets(result, query, num=3)
 254
 255     def ensure_exists(r):
 256         try:
 257             if not r.book:
 258                 return False
 259         except Book.DoesNotExist:
 260             return False
 261
 262         if lang and r.book.language != lang:
 263             return False
 264         if only_audio and not r.book.has_mp3_file():
 265             return False
 266         if only_synchro and not r.book.has_daisy_file():
 267             return False
 268         if epoch and not r.book.tags.filter(category='epoch', slug=epoch).exists():
 269             return False
 270         if kind and not r.book.tags.filter(category='kind', slug=kind).exists():
 271             return False
 272         if genre and not r.book.tags.filter(category='genre', slug=genre).exists():
 273             return False
 274
 275         return True
 276
 277     results = [r for r in results if ensure_exists(r)]
 278     return results
 279
 280
 281 def search_pictures(query, epoch=None, kind=None, genre=None):
 282     search = Search()
 283     results_parts = []
 284     search_fields = []
 285     words = query.split()
 286     fieldsets = (
 287         (['authors', 'authors_nonstem'], True),
 288         (['title', 'title_nonstem'], True),
 289         (['metadata', 'metadata_nonstem'], True),
 290         (['themes_pl', 'themes_pl_nonstem'], False),
 291     )
 292     for fields, is_book in fieldsets:
 293         search_fields += fields
 294         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))
 295     results = []
 296     ids_results = {}
 297     for results_part in results_parts:
 298         for result in sorted(PictureResult.aggregate(results_part), reverse=True):
 299             picture_id = result.picture_id
 300             if picture_id in ids_results:
 301                 ids_results[picture_id].merge(result)
 302             else:
 303                 results.append(result)
 304                 ids_results[picture_id] = result
 305
 306     def ensure_exists(r):
 307         try:
 308             if not r.picture:
 309                 return False
 310         except Picture.DoesNotExist:
 311             return False
 312
 313         if epoch and not r.picture.tags.filter(category='epoch', slug=epoch).exists():
 314             return False
 315         if kind and not r.picture.tags.filter(category='kind', slug=kind).exists():
 316             return False
 317         if genre and not r.picture.tags.filter(category='genre', slug=genre).exists():
 318             return False
 319
 320         return True
 321
 322     results = [r for r in results if ensure_exists(r)]
 323     return results
 324
 325
 326 def search_pd_authors(query):
 327     pd_authors = Author.objects.filter(name__icontains=query)
 328     existing_slugs = Tag.objects.filter(
 329         category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \
 330         .values_list('slug', flat=True)
 331     pd_authors = pd_authors.exclude(slug__in=existing_slugs)
 332     return pd_authors
 333
 334
 335 def prepare_query(query):
 336     query = ' '.join(query.split())
 337     # filter out private use characters
 338     import unicodedata
 339     query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
 340     query = remove_query_syntax_chars(query)
 341
 342     words = query.split()
 343     if len(words) > 10:
 344         query = ' '.join(words[:10])
 345     return query