src/search/views.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 from django.conf import settings
   5 from django.http.response import HttpResponseRedirect
   6 from django.shortcuts import render
   7 from django.views.decorators import cache
   8 from django.http import HttpResponse, JsonResponse
   9
  10 from catalogue.models import Book, Tag
  11 from pdcounter.models import Author
  12 from picture.models import Picture
  13 from search.index import Search, SearchResult, PictureResult
  14 from suggest.forms import PublishingSuggestForm
  15 import re
  16 import json
  17
  18 from wolnelektury.utils import re_escape
  19
  20
  21 def match_word_re(word):
  22     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
  23         return r"\b%s\b" % word
  24     elif 'mysql' in settings.DATABASES['default']['ENGINE']:
  25         return "[[:<:]]%s[[:>:]]" % word
  26
  27
  28 query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
  29
  30
  31 def remove_query_syntax_chars(query, replace=' '):
  32     return query_syntax_chars.sub(replace, query)
  33
  34
  35 def did_you_mean(query, tokens):
  36     return query
  37     # change = {}
  38     # for t in tokens:
  39     #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
  40     #     if len(authors) > 0:
  41     #         continue
  42
  43     #     if False:
  44     #         if not dictionary.check(t):
  45     #             try:
  46     #                 change_to = dictionary.suggest(t)[0].lower()
  47     #                 if change_to != t.lower():
  48     #                     change[t] = change_to
  49     #             except IndexError:
  50     #                 pass
  51
  52     # if change == {}:
  53     #     return None
  54
  55     # for frm, to in change.items():
  56     #     query = query.replace(frm, to)
  57
  58     # return query
  59
  60
  61 @cache.never_cache
  62 def hint(request, mozhint=False, param='term'):
  63     prefix = request.GET.get(param, '')
  64     if len(prefix) < 2:
  65         return JsonResponse([], safe=False)
  66
  67     prefix = re_escape(' '.join(remove_query_syntax_chars(prefix).split()))
  68
  69     try:
  70         limit = int(request.GET.get('max', ''))
  71     except ValueError:
  72         limit = 20
  73     else:
  74         if limit < 1:
  75             limit = 20
  76
  77     authors = Tag.objects.filter(
  78         category='author', name_pl__iregex='\m' + prefix).only('name', 'id', 'slug', 'category')
  79     data = [
  80         {
  81             'label': author.name,
  82             'id': author.id,
  83             'url': author.get_absolute_url(),
  84         }
  85         for author in authors[:limit]
  86     ]
  87     if len(data) < limit:
  88         data += [
  89             {
  90                 'label': b.title,
  91                 'author': b.author_unicode(),
  92                 'id': b.id,
  93                 'url': b.get_absolute_url()
  94             }
  95             for b in Book.objects.filter(findable=True, title__iregex='\m' + prefix)[:limit-len(data)]
  96         ]
  97
  98     if mozhint:
  99         data = [
 100             prefix,
 101             [
 102                 item['label']
 103                 for item in data
 104             ]
 105         ]
 106
 107     callback = request.GET.get('callback', None)
 108     if callback:
 109         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
 110                             content_type="application/json; charset=utf-8")
 111     else:
 112         return JsonResponse(data, safe=False)
 113
 114
 115 @cache.never_cache
 116 def main(request):
 117     query = request.GET.get('q', '')
 118
 119     format = request.GET.get('format')
 120     lang = request.GET.get('lang')
 121     epoch = request.GET.get('epoch')
 122     kind = request.GET.get('kind')
 123     genre = request.GET.get('genre')
 124
 125     if len(query) < 2:
 126         return render(
 127             request, 'catalogue/search_too_short.html',
 128             {'prefix': query})
 129     elif len(query) > 256:
 130         return render(
 131             request, 'catalogue/search_too_long.html',
 132             {'prefix': query})
 133
 134     query = prepare_query(query)
 135     if not (format or lang or epoch or kind or genre):
 136         pd_authors = search_pd_authors(query)
 137     else:
 138         pd_authors = []
 139     if not format or format != 'obraz':
 140         books = search_books(
 141             query,
 142             lang=lang,
 143             only_audio=format=='audio',
 144             only_synchro=format=='synchro',
 145             epoch=epoch,
 146             kind=kind,
 147             genre=genre
 148         )
 149     else:
 150         books = []
 151     if (not format or format == 'obraz') and not lang:
 152         pictures = search_pictures(
 153             query,
 154             epoch=epoch,
 155             kind=kind,
 156             genre=genre
 157         )
 158     else:
 159         pictures = []
 160
 161     suggestion = ''
 162
 163     if not (books or pictures or pd_authors):
 164         form = PublishingSuggestForm(initial={"books": query + ", "})
 165         return render(
 166             request,
 167             'catalogue/search_no_hits.html',
 168             {
 169                 'form': form,
 170                 'did_you_mean': suggestion
 171             })
 172
 173     if not (books or pictures) and len(pd_authors) == 1:
 174         return HttpResponseRedirect(pd_authors[0].get_absolute_url())
 175
 176     return render(
 177         request,
 178         'catalogue/search_multiple_hits.html',
 179         {
 180             'pd_authors': pd_authors,
 181             'books': books,
 182             'pictures': pictures,
 183             'did_you_mean': suggestion,
 184             'set': {
 185                 'lang': lang,
 186                 'format': format,
 187                 'epoch': epoch,
 188                 'kind': kind,
 189                 'genre': genre,
 190             },
 191             'tags': {
 192                 'epoch': Tag.objects.filter(category='epoch', for_books=True),
 193                 'genre': Tag.objects.filter(category='genre', for_books=True),
 194                 'kind': Tag.objects.filter(category='kind', for_books=True),
 195             },
 196         })
 197
 198 def search_books(query, lang=None, only_audio=False, only_synchro=False, epoch=None, kind=None, genre=None):
 199     search = Search()
 200     results_parts = []
 201     search_fields = []
 202     words = query.split()
 203     fieldsets = (
 204         (['authors', 'authors_nonstem'], True),
 205         (['title', 'title_nonstem'], True),
 206         (['metadata', 'metadata_nonstem'], True),
 207         (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
 208     )
 209     for fields, is_book in fieldsets:
 210         search_fields += fields
 211         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))
 212     results = []
 213     ids_results = {}
 214     for results_part in results_parts:
 215         for result in sorted(SearchResult.aggregate(results_part), reverse=True):
 216             book_id = result.book_id
 217             if book_id in ids_results:
 218                 ids_results[book_id].merge(result)
 219             else:
 220                 results.append(result)
 221                 ids_results[book_id] = result
 222     descendant_ids = set(
 223         Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
 224     results = [result for result in results if result.book_id not in descendant_ids]
 225     for result in results:
 226         search.get_snippets(result, query, num=3)
 227
 228     def ensure_exists(r):
 229         try:
 230             if not r.book:
 231                 return False
 232         except Book.DoesNotExist:
 233             return False
 234
 235         if lang and r.book.language != lang:
 236             return False
 237         if only_audio and not r.book.has_mp3_file():
 238             return False
 239         if only_synchro and not r.book.has_daisy_file():
 240             return False
 241         if epoch and not r.book.tags.filter(category='epoch', slug=epoch).exists():
 242             return False
 243         if kind and not r.book.tags.filter(category='kind', slug=kind).exists():
 244             return False
 245         if genre and not r.book.tags.filter(category='genre', slug=genre).exists():
 246             return False
 247
 248         return True
 249
 250     results = [r for r in results if ensure_exists(r)]
 251     return results
 252
 253
 254 def search_pictures(query, epoch=None, kind=None, genre=None):
 255     search = Search()
 256     results_parts = []
 257     search_fields = []
 258     words = query.split()
 259     fieldsets = (
 260         (['authors', 'authors_nonstem'], True),
 261         (['title', 'title_nonstem'], True),
 262         (['metadata', 'metadata_nonstem'], True),
 263         (['themes_pl', 'themes_pl_nonstem'], False),
 264     )
 265     for fields, is_book in fieldsets:
 266         search_fields += fields
 267         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))
 268     results = []
 269     ids_results = {}
 270     for results_part in results_parts:
 271         for result in sorted(PictureResult.aggregate(results_part), reverse=True):
 272             picture_id = result.picture_id
 273             if picture_id in ids_results:
 274                 ids_results[picture_id].merge(result)
 275             else:
 276                 results.append(result)
 277                 ids_results[picture_id] = result
 278
 279     def ensure_exists(r):
 280         try:
 281             if not r.picture:
 282                 return False
 283         except Picture.DoesNotExist:
 284             return False
 285
 286         if epoch and not r.picture.tags.filter(category='epoch', slug=epoch).exists():
 287             return False
 288         if kind and not r.picture.tags.filter(category='kind', slug=kind).exists():
 289             return False
 290         if genre and not r.picture.tags.filter(category='genre', slug=genre).exists():
 291             return False
 292
 293         return True
 294
 295     results = [r for r in results if ensure_exists(r)]
 296     return results
 297
 298
 299 def search_pd_authors(query):
 300     pd_authors = Author.objects.filter(name__icontains=query)
 301     existing_slugs = Tag.objects.filter(
 302         category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \
 303         .values_list('slug', flat=True)
 304     pd_authors = pd_authors.exclude(slug__in=existing_slugs)
 305     return pd_authors
 306
 307
 308 def prepare_query(query):
 309     query = ' '.join(query.split())
 310     # filter out private use characters
 311     import unicodedata
 312     query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
 313     query = remove_query_syntax_chars(query)
 314
 315     words = query.split()
 316     if len(words) > 10:
 317         query = ' '.join(words[:10])
 318     return query