Searching, filtering fixes.
[wolnelektury.git] / src / search / views.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from django.conf import settings
5 from django.http.response import HttpResponseRedirect
6 from django.shortcuts import render
7 from django.views.decorators import cache
8 from django.http import HttpResponse, JsonResponse
9
10 from catalogue.models import Book, Tag
11 from pdcounter.models import Author
12 from picture.models import Picture
13 from search.index import Search, SearchResult, PictureResult
14 from .forms import SearchFilters
15 from suggest.forms import PublishingSuggestForm
16 import re
17 import json
18
19 from wolnelektury.utils import re_escape
20
21
def match_word_re(word):
    """Return a whole-word regex for *word* in the default database's dialect.

    The word-boundary syntax is chosen from the ``ENGINE`` setting of the
    ``default`` database. *word* is interpolated as-is, so callers must
    escape it themselves if it may contain regex metacharacters.

    Returns ``None`` when the engine is not one of the recognised backends.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        return r"\b%s\b" % word
    if 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % word
    if 'postgresql' in engine or 'postgis' in engine:
        # PostgreSQL spells word start / word end as \m and \M; hint() in
        # this module already relies on \m.
        return r"\m%s\M" % word
    return None
27
28
# Characters treated as query syntax; they are stripped from user input.
query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")


def remove_query_syntax_chars(query, replace=' '):
    """Return *query* with every query-syntax character replaced.

    Each character matched by ``query_syntax_chars`` is substituted with
    *replace* (a single space by default).
    """
    return query_syntax_chars.sub(repl=replace, string=query)
34
35
def did_you_mean(query, tokens):
    """Return a spelling suggestion for *query*; currently a no-op.

    The suggestion logic is disabled, so *query* is returned unchanged and
    *tokens* is ignored.

    The previous (commented-out) implementation skipped every token that
    matched an author tag name via ``match_word_re``, replaced the remaining
    tokens with the first lower-cased suggestion of a spell-checking
    ``dictionary`` (that part was itself switched off with ``if False``),
    and returned ``None`` when nothing was changed.
    """
    return query
60
61
@cache.never_cache
def hint(request, mozhint=False, param='term'):
    """Return autocomplete suggestions for a search prefix as JSON.

    The prefix is read from the GET parameter named by *param*. Author tags
    whose ``name_pl`` has a word starting with the prefix come first; if
    fewer than the limit were found, findable books whose title has a word
    starting with the prefix fill the remaining slots.

    GET parameters:
        <param>: the typed prefix; fewer than 2 characters (before or after
            stripping query-syntax characters) yields an empty list.
        max: maximum number of suggestions; defaults to 20 when missing,
            non-numeric or lower than 1.
        callback: optional JSONP callback name. It must be a dotted
            JavaScript identifier, otherwise HTTP 400 is returned.

    When *mozhint* is true the payload is ``[prefix, [label, ...]]`` instead
    of the list of suggestion dicts.
    """
    empty = []
    raw_prefix = request.GET.get(param, '')
    if len(raw_prefix) < 2:
        return JsonResponse(empty, safe=False)

    # Syntax characters are turned into spaces, so the length has to be
    # checked again: input such as "**" would otherwise collapse to an
    # empty pattern that matches every row.
    normalized = ' '.join(remove_query_syntax_chars(raw_prefix).split())
    if len(normalized) < 2:
        return JsonResponse(empty, safe=False)
    prefix = re_escape(normalized)

    try:
        limit = int(request.GET.get('max', ''))
    except ValueError:
        limit = 20
    else:
        if limit < 1:
            limit = 20

    # \m is the word-start anchor in PostgreSQL regex syntax. A raw string
    # is used because "\m" is not a valid Python escape sequence.
    word_start = r'\m' + prefix

    authors = Tag.objects.filter(
        category='author', name_pl__iregex=word_start).only('name', 'id', 'slug', 'category')
    data = [
        {
            'label': author.name,
            'id': author.id,
            'url': author.get_absolute_url(),
        }
        for author in authors[:limit]
    ]
    if len(data) < limit:
        books = Book.objects.filter(findable=True, title__iregex=word_start)
        for b in books[:limit - len(data)]:
            author_str = b.author_unicode()
            translator = b.translator()
            if translator:
                author_str += ' (tłum. ' + translator + ')'
            data.append(
                {
                    'label': b.title,
                    'author': author_str,
                    'id': b.id,
                    'url': b.get_absolute_url()
                }
            )

    if mozhint:
        # NOTE(review): this echoes the prefix after re_escape(), not the
        # text the user typed -- confirm that clients expect that.
        data = [prefix, [item['label'] for item in data]]

    callback = request.GET.get('callback', None)
    if callback:
        # The callback is echoed verbatim into executable output, so only
        # plain dotted identifiers are accepted.
        if not re.fullmatch(
                r'[A-Za-z_$][\w$]*(?:\.[A-Za-z_$][\w$]*)*', callback, re.ASCII):
            return HttpResponse(status=400)
        return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                            content_type="application/json; charset=utf-8")
    return JsonResponse(data, safe=False)
118
119
120
@cache.never_cache
def search(request):
    """Render the search results page driven by the ``SearchFilters`` form.

    The form is bound to ``request.GET``. When it validates, its
    ``results()`` mapping is put into the template context under
    ``results``, and ``hasresults`` is set if any of its values is truthy.
    """
    filters = SearchFilters(request.GET)
    ctx = {
        'title': 'Wynik wyszukiwania',
        # A request without "q" must not fail on a missing key; an empty
        # query is shown instead. Assumes filters.data is the bound
        # request.GET mapping.
        'query': filters.data.get('q', ''),
        'filters': filters,
    }
    if filters.is_valid():
        results = filters.results()
        ctx['results'] = results
        if any(results.values()):
            ctx['hasresults'] = True
    return render(request, 'search/results.html', ctx)
136
137
@cache.never_cache
def main(request):
    """Handle a catalogue search request and render the matching page.

    Delegates to ``search()`` when the ``layout`` experiment is enabled.
    Otherwise reads ``q`` plus the optional ``format``, ``lang``, ``epoch``,
    ``kind`` and ``genre`` filters from the query string and renders:

    * a "too short" / "too long" page when ``q`` has fewer than 2 or more
      than 256 characters,
    * a "no hits" page with a publishing suggestion form when nothing was
      found,
    * a redirect to the author's page when the only hit is a single
      public-domain-counter author,
    * the multiple-hits page otherwise.
    """
    if request.EXPERIMENTS['layout'].value:
        return search(request)

    params = request.GET
    query = params.get('q', '')
    fmt = params.get('format')
    lang = params.get('lang')
    epoch = params.get('epoch')
    kind = params.get('kind')
    genre = params.get('genre')

    if len(query) < 2:
        return render(request, 'catalogue/search_too_short.html', {'prefix': query})
    if len(query) > 256:
        return render(request, 'catalogue/search_too_long.html', {'prefix': query})

    query = prepare_query(query)
    tag_filters = {'epoch': epoch, 'kind': kind, 'genre': genre}

    # Public-domain-counter authors are only looked up for unfiltered searches.
    any_filter = fmt or lang or epoch or kind or genre
    pd_authors = [] if any_filter else search_pd_authors(query)

    # Books are searched unless the user asked for pictures only.
    if fmt != 'obraz':
        books = search_books(
            query,
            lang=lang,
            only_audio=(fmt == 'audio'),
            only_synchro=(fmt == 'synchro'),
            **tag_filters
        )
    else:
        books = []

    # Pictures are searched when no format (or the picture format) was
    # requested and no language filter is set.
    if (not fmt or fmt == 'obraz') and not lang:
        pictures = search_pictures(query, **tag_filters)
    else:
        pictures = []

    suggestion = ''

    if not (books or pictures or pd_authors):
        return render(
            request,
            'catalogue/search_no_hits.html',
            {
                'form': PublishingSuggestForm(initial={"books": query + ", "}),
                'did_you_mean': suggestion,
            })

    if not (books or pictures) and len(pd_authors) == 1:
        return HttpResponseRedirect(pd_authors[0].get_absolute_url())

    context = {
        'pd_authors': pd_authors,
        'books': books,
        'pictures': pictures,
        'did_you_mean': suggestion,
        'set': {'lang': lang, 'format': fmt, **tag_filters},
        'tags': {
            category: Tag.objects.filter(category=category, for_books=True)
            for category in ('epoch', 'genre', 'kind')
        },
    }
    return render(request, 'catalogue/search_multiple_hits.html', context)

223
def search_books(query, lang=None, only_audio=False, only_synchro=False,
                 epoch=None, kind=None, genre=None):
    """Search the index for books matching *query* and apply the filters.

    The query words are searched in four passes over a growing list of
    fields (authors, then title, then metadata, then text and themes); each
    pass requires a match in the newly added fields. Hits for the same book
    are merged, keeping the position of the first hit. Books that have an
    ancestor among the hits are dropped, snippets are fetched for the rest,
    and finally results are filtered by existence of the book, *lang*,
    *only_audio*, *only_synchro* and the *epoch* / *kind* / *genre* tag
    slugs.

    Returns a list of merged search results.
    """
    index = Search()
    terms = query.split()
    field_groups = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
    )

    # The same list object is extended in place and handed to every pass.
    searched_fields = []
    partial_hits = []
    for group, is_book in field_groups:
        searched_fields.extend(group)
        partial_hits.append(
            index.search_words(terms, searched_fields, required=group, book=is_book))

    hits = []
    hits_by_book = {}
    for part in partial_hits:
        for hit in sorted(SearchResult.aggregate(part), reverse=True):
            key = hit.book_id
            if key not in hits_by_book:
                hits.append(hit)
                hits_by_book[key] = hit
                continue
            hits_by_book[key].merge(hit)

    # Drop books whose ancestor is also among the hits.
    descendant_ids = set(
        Book.objects
        .filter(id__in=hits_by_book, ancestor__in=hits_by_book)
        .values_list('id', flat=True))
    hits = [hit for hit in hits if hit.book_id not in descendant_ids]

    for hit in hits:
        index.get_snippets(hit, query, num=3)

    tag_filters = (('epoch', epoch), ('kind', kind), ('genre', genre))

    def passes_filters(r):
        """Tell whether the hit's book exists and satisfies every filter."""
        try:
            if not r.book:
                return False
        except Book.DoesNotExist:
            return False

        if lang and r.book.language != lang:
            return False
        if only_audio and not r.book.has_mp3_file():
            return False
        if only_synchro and not r.book.has_daisy_file():
            return False
        for category, slug in tag_filters:
            if slug and not r.book.tags.filter(category=category, slug=slug).exists():
                return False
        return True

    return [hit for hit in hits if passes_filters(hit)]
278
279
def search_pictures(query, epoch=None, kind=None, genre=None):
    """Search the index for pictures matching *query* and apply the filters.

    The query words are searched in four passes over a growing list of
    fields (authors, then title, then metadata, then themes); each pass
    requires a match in the newly added fields. Hits for the same picture
    are merged, keeping the position of the first hit. Results whose
    picture does not exist, or which lack the requested *epoch* / *kind* /
    *genre* tag slug, are dropped.

    Returns a list of merged picture results.
    """
    index = Search()
    terms = query.split()
    field_groups = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['themes_pl', 'themes_pl_nonstem'], False),
    )

    # The same list object is extended in place and handed to every pass.
    searched_fields = []
    partial_hits = []
    for group, is_book in field_groups:
        searched_fields.extend(group)
        partial_hits.append(
            index.search_words(
                terms, searched_fields, required=group, book=is_book, picture=True))

    hits = []
    hits_by_picture = {}
    for part in partial_hits:
        for hit in sorted(PictureResult.aggregate(part), reverse=True):
            key = hit.picture_id
            if key not in hits_by_picture:
                hits.append(hit)
                hits_by_picture[key] = hit
                continue
            hits_by_picture[key].merge(hit)

    tag_filters = (('epoch', epoch), ('kind', kind), ('genre', genre))

    def passes_filters(r):
        """Tell whether the hit's picture exists and has the wanted tags."""
        try:
            if not r.picture:
                return False
        except Picture.DoesNotExist:
            return False

        for category, slug in tag_filters:
            if slug and not r.picture.tags.filter(category=category, slug=slug).exists():
                return False
        return True

    return [hit for hit in hits if passes_filters(hit)]
323
324
def search_pd_authors(query):
    """Return public-domain-counter authors whose name contains *query*.

    Authors whose slug already exists as an ``author`` tag in the catalogue
    are excluded. The match is case-insensitive.
    """
    matching = Author.objects.filter(name__icontains=query)
    candidate_slugs = list(matching.values_list('slug', flat=True))
    catalogue_slugs = Tag.objects.filter(
        category='author', slug__in=candidate_slugs,
    ).values_list('slug', flat=True)
    return matching.exclude(slug__in=catalogue_slugs)
332
333
def prepare_query(query):
    """Normalise a raw user query before it is passed to the search code.

    Private-use characters (Unicode category ``Co``) are dropped,
    query-syntax characters are removed, whitespace is collapsed to single
    spaces with no leading or trailing space, and the query is cut to its
    first 10 words.

    Returns the cleaned query; it may be an empty string.
    """
    import unicodedata

    # Filter out private use characters.
    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')

    # Syntax characters are replaced with spaces, so the words are always
    # re-joined; otherwise "Mickiewicz." would become "Mickiewicz " and the
    # stray space would defeat substring matching on the result.
    words = remove_query_syntax_chars(query).split()
    return ' '.join(words[:10])