src/search/views.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 from django.conf import settings
   5 from django.http.response import HttpResponseRedirect
   6 from django.shortcuts import render
   7 from django.views.decorators import cache
   8 from django.http import HttpResponse, JsonResponse
   9
  10 from catalogue.models import Book, Tag
  11 from pdcounter.models import Author
  12 from picture.models import Picture
  13 from search.index import Search, SearchResult, PictureResult
  14 from suggest.forms import PublishingSuggestForm
  15 import re
  16 import json
  17
  18 from wolnelektury.utils import re_escape
  19
  20
  21 def match_word_re(word):
  22     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
  23         return r"\b%s\b" % word
  24     elif 'mysql' in settings.DATABASES['default']['ENGINE']:
  25         return "[[:<:]]%s[[:>:]]" % word
  26
  27
  28 query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
  29
  30
  31 def remove_query_syntax_chars(query, replace=' '):
  32     return query_syntax_chars.sub(replace, query)
  33
  34
  35 def did_you_mean(query, tokens):
  36     return query
  37     # change = {}
  38     # for t in tokens:
  39     #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
  40     #     if len(authors) > 0:
  41     #         continue
  42
  43     #     if False:
  44     #         if not dictionary.check(t):
  45     #             try:
  46     #                 change_to = dictionary.suggest(t)[0].lower()
  47     #                 if change_to != t.lower():
  48     #                     change[t] = change_to
  49     #             except IndexError:
  50     #                 pass
  51
  52     # if change == {}:
  53     #     return None
  54
  55     # for frm, to in change.items():
  56     #     query = query.replace(frm, to)
  57
  58     # return query
  59
  60
  61 @cache.never_cache
  62 def hint(request, mozhint=False, param='term'):
  63     prefix = request.GET.get(param, '')
  64     if len(prefix) < 2:
  65         return JsonResponse([], safe=False)
  66
  67     prefix = re_escape(' '.join(remove_query_syntax_chars(prefix).split()))
  68
  69     try:
  70         limit = int(request.GET.get('max', ''))
  71     except ValueError:
  72         limit = 20
  73     else:
  74         if limit < 1:
  75             limit = 20
  76
  77     authors = Tag.objects.filter(
  78         category='author', name_pl__iregex='\m' + prefix).only('name', 'id', 'slug', 'category')
  79     data = [
  80         {
  81             'label': author.name,
  82             'id': author.id,
  83             'url': author.get_absolute_url(),
  84         }
  85         for author in authors[:limit]
  86     ]
  87     if len(data) < limit:
  88         data += [
  89             {
  90                 'label': b.title,
  91                 'author': b.author_unicode(),
  92                 'id': b.id,
  93                 'url': b.get_absolute_url()
  94             }
  95             for b in Book.objects.filter(findable=True, title__iregex='\m' + prefix)[:limit-len(data)]
  96         ]
  97
  98     if mozhint:
  99         data = [
 100             prefix,
 101             [
 102                 item['label']
 103                 for item in data
 104             ]
 105         ]
 106
 107     callback = request.GET.get('callback', None)
 108     if callback:
 109         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
 110                             content_type="application/json; charset=utf-8")
 111     else:
 112         return JsonResponse(data, safe=False)
 113
 114
 115 @cache.never_cache
 116 def main(request):
 117     query = request.GET.get('q', '')
 118
 119     format = request.GET.get('format')
 120     lang = request.GET.get('lang')
 121     epoch = request.GET.get('epoch')
 122     kind = request.GET.get('kind')
 123     genre = request.GET.get('genre')
 124
 125     if len(query) < 2:
 126         return render(
 127             request, 'catalogue/search_too_short.html',
 128             {'prefix': query})
 129     elif len(query) > 256:
 130         return render(
 131             request, 'catalogue/search_too_long.html',
 132             {'prefix': query})
 133
 134     query = prepare_query(query)
 135     if not (format or lang or epoch or kind or genre):
 136         pd_authors = search_pd_authors(query)
 137     else:
 138         pd_authors = []
 139     if not format or format != 'obraz':
 140         books = search_books(
 141             query,
 142             lang=lang,
 143             only_audio=format=='audio',
 144             only_synchro=format=='synchro',
 145             epoch=epoch,
 146             kind=kind,
 147             genre=genre
 148         )
 149     else:
 150         books = []
 151     if (not format or format == 'obraz') and not lang:
 152         pictures = search_pictures(
 153             query,
 154             epoch=epoch,
 155             kind=kind,
 156             genre=genre
 157         )
 158     else:
 159         pictures = []
 160
 161     suggestion = ''
 162
 163     if not (books or pictures or pd_authors):
 164         form = PublishingSuggestForm(initial={"books": query + ", "})
 165         return render(
 166             request,
 167             'catalogue/search_no_hits.html',
 168             {
 169                 'form': form,
 170                 'did_you_mean': suggestion
 171             })
 172
 173     if not (books or pictures) and len(pd_authors) == 1:
 174         return HttpResponseRedirect(pd_authors[0].get_absolute_url())
 175
 176     return render(
 177         request,
 178         'catalogue/search_multiple_hits.html',
 179         {
 180             'pd_authors': pd_authors,
 181             'books': books,
 182             'pictures': pictures,
 183             'did_you_mean': suggestion,
 184             'set': {
 185                 'lang': lang,
 186                 'format': format,
 187                 'epoch': epoch,
 188                 'kind': kind,
 189                 'genre': genre,
 190             },
 191             'tags': {
 192                 'epoch': Tag.objects.filter(category='epoch'),
 193                 'genre': Tag.objects.filter(category='genre'),
 194                 'kind': Tag.objects.filter(category='kind'),
 195             },
 196         })
 197
 198 def search_books(query, lang=None, only_audio=False, only_synchro=False, epoch=None, kind=None, genre=None):
 199     search = Search()
 200     results_parts = []
 201     search_fields = []
 202     words = query.split()
 203     fieldsets = (
 204         (['authors', 'authors_nonstem'], True),
 205         (['title', 'title_nonstem'], True),
 206         (['metadata', 'metadata_nonstem'], True),
 207         (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
 208     )
 209     for fields, is_book in fieldsets:
 210         search_fields += fields
 211         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))
 212     results = []
 213     ids_results = {}
 214     for results_part in results_parts:
 215         for result in sorted(SearchResult.aggregate(results_part), reverse=True):
 216             book_id = result.book_id
 217             if book_id in ids_results:
 218                 ids_results[book_id].merge(result)
 219             else:
 220                 results.append(result)
 221                 ids_results[book_id] = result
 222     descendant_ids = set(
 223         Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
 224     results = [result for result in results if result.book_id not in descendant_ids]
 225     for result in results:
 226         search.get_snippets(result, query, num=3)
 227
 228     def ensure_exists(r):
 229         try:
 230             r.book
 231         except Book.DoesNotExist:
 232             return False
 233
 234         print(lang, r.book.language)
 235         if lang and r.book.language != lang:
 236             return False
 237         if only_audio and not r.book.has_mp3_file():
 238             return False
 239         if only_synchro and not r.book.has_daisy_file():
 240             return False
 241         if epoch and not r.book.tags.filter(category='epoch', slug=epoch).exists():
 242             return False
 243         if kind and not r.book.tags.filter(category='kind', slug=kind).exists():
 244             return False
 245         if genre and not r.book.tags.filter(category='genre', slug=genre).exists():
 246             return False
 247
 248         return True
 249
 250     results = [r for r in results if ensure_exists(r)]
 251     return results
 252
 253
 254 def search_pictures(query, epoch=None, kind=None, genre=None):
 255     search = Search()
 256     results_parts = []
 257     search_fields = []
 258     words = query.split()
 259     fieldsets = (
 260         (['authors', 'authors_nonstem'], True),
 261         (['title', 'title_nonstem'], True),
 262         (['metadata', 'metadata_nonstem'], True),
 263         (['themes_pl', 'themes_pl_nonstem'], False),
 264     )
 265     for fields, is_book in fieldsets:
 266         search_fields += fields
 267         results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))
 268     results = []
 269     ids_results = {}
 270     for results_part in results_parts:
 271         for result in sorted(PictureResult.aggregate(results_part), reverse=True):
 272             picture_id = result.picture_id
 273             if picture_id in ids_results:
 274                 ids_results[picture_id].merge(result)
 275             else:
 276                 results.append(result)
 277                 ids_results[picture_id] = result
 278
 279     def ensure_exists(r):
 280         try:
 281             return r.picture
 282         except Picture.DoesNotExist:
 283             return False
 284
 285         if epoch and not r.picture.tags.filter(category='epoch', slug=epoch).exists():
 286             return False
 287         if kind and not r.picture.tags.filter(category='kind', slug=kind).exists():
 288             return False
 289         if genre and not r.picture.tags.filter(category='genre', slug=genre).exists():
 290             return False
 291
 292     results = [r for r in results if ensure_exists(r)]
 293     return results
 294
 295
 296 def search_pd_authors(query):
 297     pd_authors = Author.objects.filter(name__icontains=query)
 298     existing_slugs = Tag.objects.filter(
 299         category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \
 300         .values_list('slug', flat=True)
 301     pd_authors = pd_authors.exclude(slug__in=existing_slugs)
 302     return pd_authors
 303
 304
 305 def prepare_query(query):
 306     query = ' '.join(query.split())
 307     # filter out private use characters
 308     import unicodedata
 309     query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
 310     query = remove_query_syntax_chars(query)
 311
 312     words = query.split()
 313     if len(words) > 10:
 314         query = ' '.join(words[:10])
 315     return query