src/search/views.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 from django.conf import settings
   5 from django.http.response import HttpResponseRedirect
   6 from django.shortcuts import render
   7 from django.views.decorators import cache
   8 from django.http import HttpResponse, JsonResponse
   9
  10 from catalogue.models import Book, Tag
  11 from pdcounter.models import Author
  12 from picture.models import Picture
  13 from search.index import Search, SearchResult, PictureResult
  14 from suggest.forms import PublishingSuggestForm
  15 import re
  16 import json
  17
  18 from wolnelektury.utils import re_escape
  19
  20
  21 def match_word_re(word):
  22     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
  23         return r"\b%s\b" % word
  24     elif 'mysql' in settings.DATABASES['default']['ENGINE']:
  25         return "[[:<:]]%s[[:>:]]" % word
  26
  27
  28 query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
  29
  30
  31 def remove_query_syntax_chars(query, replace=' '):
  32     return query_syntax_chars.sub(replace, query)
  33
  34
  35 def did_you_mean(query, tokens):
  36     return query
  37     # change = {}
  38     # for t in tokens:
  39     #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
  40     #     if len(authors) > 0:
  41     #         continue
  42
  43     #     if False:
  44     #         if not dictionary.check(t):
  45     #             try:
  46     #                 change_to = dictionary.suggest(t)[0].lower()
  47     #                 if change_to != t.lower():
  48     #                     change[t] = change_to
  49     #             except IndexError:
  50     #                 pass
  51
  52     # if change == {}:
  53     #     return None
  54
  55     # for frm, to in change.items():
  56     #     query = query.replace(frm, to)
  57
  58     # return query
  59
  60
  61 @cache.never_cache
  62 def hint(request, mozhint=False, param='term'):
  63     prefix = request.GET.get(param, '')
  64     if len(prefix) < 2:
  65         return JsonResponse([], safe=False)
  66
  67     prefix = re_escape(' '.join(remove_query_syntax_chars(prefix).split()))
  68
  69     try:
  70         limit = int(request.GET.get('max', ''))
  71     except ValueError:
  72         limit = 20
  73     else:
  74         if limit < 1:
  75             limit = 20
  76
  77     authors = Tag.objects.filter(
  78         category='author', name_pl__iregex='\m' + prefix).only('name', 'id', 'slug', 'category')
  79     data = [
  80         {
  81             'label': author.name,
  82             'id': author.id,
  83             'url': author.get_absolute_url(),
  84         }
  85         for author in authors[:limit]
  86     ]
  87     if len(data) < limit:
  88         for b in Book.objects.filter(findable=True, title__iregex='\m' + prefix)[:limit-len(data)]:
  89             author_str = b.author_unicode()
  90             translator = b.translator()
  91             if translator:
  92                 author_str += ' (tłum. ' + translator + ')'
  93             data.append(
  94                 {
  95                     'label': b.title,
  96                     'author': author_str,
  97                     'id': b.id,
  98                     'url': b.get_absolute_url()
  99                 }
 100             )
 101
 102     if mozhint:
 103         data = [
 104             prefix,
 105             [
 106                 item['label']
 107                 for item in data
 108             ]
 109         ]
 110
 111     callback = request.GET.get('callback', None)
 112     if callback:
 113         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
 114                             content_type="application/json; charset=utf-8")
 115     else:
 116         return JsonResponse(data, safe=False)
 117
 118
 119 @cache.never_cache
 120 def main(request):
 121     query = request.GET.get('q', '')
 122
 123     format = request.GET.get('format')
 124     lang = request.GET.get('lang')
 125     epoch = request.GET.get('epoch')
 126     kind = request.GET.get('kind')
 127     genre = request.GET.get('genre')
 128
 129     if len(query) < 2:
 130         return render(
 131             request, 'catalogue/search_too_short.html',
 132             {'prefix': query})
 133     elif len(query) > 256:
 134         return render(
 135             request, 'catalogue/search_too_long.html',
 136             {'prefix': query})
 137
 138     query = prepare_query(query)
 139     if not (format or lang or epoch or kind or genre):
 140         pd_authors = search_pd_authors(query)
 141     else:
 142         pd_authors = []
 143     if not format or format != 'obraz':
 144         books = search_books(
 145             query,
 146             lang=lang,
 147             only_audio=format=='audio',
 148             only_synchro=format=='synchro',
 149             epoch=epoch,
 150             kind=kind,
 151             genre=genre
 152         )
 153     else:
 154         books = []
 155     if (not format or format == 'obraz') and not lang:
 156         pictures = search_pictures(
 157             query,
 158             epoch=epoch,
 159             kind=kind,
 160             genre=genre
 161         )
 162     else:
 163         pictures = []
 164
 165     suggestion = ''
 166
 167     if not (books or pictures or pd_authors):
 168         form = PublishingSuggestForm(initial={"books": query + ", "})
 169         return render(
 170             request,
 171             'catalogue/search_no_hits.html',
 172             {
 173                 'form': form,
 174                 'did_you_mean': suggestion
 175             })
 176
 177     if not (books or pictures) and len(pd_authors) == 1:
 178         return HttpResponseRedirect(pd_authors[0].get_absolute_url())
 179
 180     return render(
 181         request,
 182         'catalogue/search_multiple_hits.html',
 183         {
 184             'pd_authors': pd_authors,
 185             'books': books,
 186             'pictures': pictures,
 187             'did_you_mean': suggestion,
 188             'set': {
 189                 'lang': lang,
 190                 'format': format,
 191                 'epoch': epoch,
 192                 'kind': kind,
 193                 'genre': genre,
 194             },
 195             'tags': {
 196                 'epoch': Tag.objects.filter(category='epoch', for_books=True),
 197                 'genre': Tag.objects.filter(category='genre', for_books=True),
 198                 'kind': Tag.objects.filter(category='kind', for_books=True),
 199             },
 200         })
 201
 202 def search_books(query, lang=None, only_audio=False, only_synchro=False, epoch=None, kind=None, genre=None):
 203     search = Search()
 204     results_parts = []
 205     search_fields = []
 206     words = query.split()
 207     fieldsets = (
 208         (['authors', 'authors_nonstem'], True),
 209         (['title', 'title_nonstem'], True),
 210         (['metadata', 'metadata_nonstem'], True),
 211         (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
 212     )
 213     for fields, is_book in fieldsets:
 214         search_fields += fields
 215         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))
 216     results = []
 217     ids_results = {}
 218     for results_part in results_parts:
 219         for result in sorted(SearchResult.aggregate(results_part), reverse=True):
 220             book_id = result.book_id
 221             if book_id in ids_results:
 222                 ids_results[book_id].merge(result)
 223             else:
 224                 results.append(result)
 225                 ids_results[book_id] = result
 226     descendant_ids = set(
 227         Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
 228     results = [result for result in results if result.book_id not in descendant_ids]
 229     for result in results:
 230         search.get_snippets(result, query, num=3)
 231
 232     def ensure_exists(r):
 233         try:
 234             if not r.book:
 235                 return False
 236         except Book.DoesNotExist:
 237             return False
 238
 239         if lang and r.book.language != lang:
 240             return False
 241         if only_audio and not r.book.has_mp3_file():
 242             return False
 243         if only_synchro and not r.book.has_daisy_file():
 244             return False
 245         if epoch and not r.book.tags.filter(category='epoch', slug=epoch).exists():
 246             return False
 247         if kind and not r.book.tags.filter(category='kind', slug=kind).exists():
 248             return False
 249         if genre and not r.book.tags.filter(category='genre', slug=genre).exists():
 250             return False
 251
 252         return True
 253
 254     results = [r for r in results if ensure_exists(r)]
 255     return results
 256
 257
 258 def search_pictures(query, epoch=None, kind=None, genre=None):
 259     search = Search()
 260     results_parts = []
 261     search_fields = []
 262     words = query.split()
 263     fieldsets = (
 264         (['authors', 'authors_nonstem'], True),
 265         (['title', 'title_nonstem'], True),
 266         (['metadata', 'metadata_nonstem'], True),
 267         (['themes_pl', 'themes_pl_nonstem'], False),
 268     )
 269     for fields, is_book in fieldsets:
 270         search_fields += fields
 271         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))
 272     results = []
 273     ids_results = {}
 274     for results_part in results_parts:
 275         for result in sorted(PictureResult.aggregate(results_part), reverse=True):
 276             picture_id = result.picture_id
 277             if picture_id in ids_results:
 278                 ids_results[picture_id].merge(result)
 279             else:
 280                 results.append(result)
 281                 ids_results[picture_id] = result
 282
 283     def ensure_exists(r):
 284         try:
 285             if not r.picture:
 286                 return False
 287         except Picture.DoesNotExist:
 288             return False
 289
 290         if epoch and not r.picture.tags.filter(category='epoch', slug=epoch).exists():
 291             return False
 292         if kind and not r.picture.tags.filter(category='kind', slug=kind).exists():
 293             return False
 294         if genre and not r.picture.tags.filter(category='genre', slug=genre).exists():
 295             return False
 296
 297         return True
 298
 299     results = [r for r in results if ensure_exists(r)]
 300     return results
 301
 302
 303 def search_pd_authors(query):
 304     pd_authors = Author.objects.filter(name__icontains=query)
 305     existing_slugs = Tag.objects.filter(
 306         category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \
 307         .values_list('slug', flat=True)
 308     pd_authors = pd_authors.exclude(slug__in=existing_slugs)
 309     return pd_authors
 310
 311
 312 def prepare_query(query):
 313     query = ' '.join(query.split())
 314     # filter out private use characters
 315     import unicodedata
 316     query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
 317     query = remove_query_syntax_chars(query)
 318
 319     words = query.split()
 320     if len(words) > 10:
 321         query = ' '.join(words[:10])
 322     return query