search fix
[wolnelektury.git] / src / search / views.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from django.conf import settings
5 from django.http.response import HttpResponseRedirect
6 from django.shortcuts import render
7 from django.views.decorators import cache
8 from django.http import HttpResponse, JsonResponse
9
10 from catalogue.models import Book, Tag
11 from pdcounter.models import Author
12 from picture.models import Picture
13 from search.index import Search, SearchResult, PictureResult
14 from .forms import SearchFilters
15 from suggest.forms import PublishingSuggestForm
16 import re
17 import json
18
19 from wolnelektury.utils import re_escape
20
21
def match_word_re(word):
    """Return a regex that matches ``word`` as a whole word.

    Word-boundary syntax differs between database regex engines, so the
    pattern is picked from the ``ENGINE`` of the default database.
    ``word`` is interpolated verbatim; escape it first if it may contain
    regex metacharacters.

    Returns ``None`` when the engine is not one of the recognised ones.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        return r"\b%s\b" % word
    if 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % word
    if 'postgresql' in engine or 'postgis' in engine:
        # PostgreSQL regex syntax: \m / \M anchor at the start / end of a word.
        return r"\m%s\M" % word
    # Unknown engine: keep the historical behaviour of returning None.
    return None
27
28
# Characters stripped from user queries before they are used for searching:
# backslash, slash, '*', ':', parentheses, braces, '?', '.', brackets and '+'.
query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")


def remove_query_syntax_chars(query, replace=' '):
    """Return ``query`` with every query-syntax character swapped for ``replace``.

    Each matched character is substituted individually, so a run of syntax
    characters yields the replacement repeated once per character.
    """
    sanitized = re.sub(query_syntax_chars, replace, query)
    return sanitized
34
35
def did_you_mean(query, tokens):
    """Return a corrected version of ``query``; currently a no-op.

    The function returns ``query`` unchanged and ignores ``tokens``.
    The commented-out code below is a disabled implementation that skipped
    tokens matching an author tag and replaced the remaining ones with
    dictionary suggestions.
    """
    return query
    # change = {}
    # for t in tokens:
    #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
    #     if len(authors) > 0:
    #         continue

    #     if False:
    #         if not dictionary.check(t):
    #             try:
    #                 change_to = dictionary.suggest(t)[0].lower()
    #                 if change_to != t.lower():
    #                     change[t] = change_to
    #             except IndexError:
    #                 pass

    # if change == {}:
    #     return None

    # for frm, to in change.items():
    #     query = query.replace(frm, to)

    # return query
60
61
@cache.never_cache
def hint(request, mozhint=False, param='term'):
    """Return autocomplete suggestions for a search prefix as JSON.

    The prefix is read from the GET parameter named by ``param``. Author
    tags whose name has a word starting with the prefix come first; if
    fewer than the limit were found, findable books matched the same way by
    title fill the remainder. The limit comes from the ``max`` GET
    parameter and falls back to 20 when missing, non-numeric or below 1.

    With ``mozhint`` the payload is ``[prefix, [labels...]]`` instead of a
    list of dicts. When a ``callback`` GET parameter is present the payload
    is wrapped in a call to it (JSONP).

    An empty list is returned when the prefix is shorter than two
    characters or contains nothing but query-syntax characters/whitespace.
    """
    raw_prefix = request.GET.get(param, '')
    if len(raw_prefix) < 2:
        return JsonResponse([], safe=False)

    cleaned = ' '.join(remove_query_syntax_chars(raw_prefix).split())
    if not cleaned:
        # Nothing searchable is left (e.g. ".."); an empty pattern would
        # match every author and every book.
        return JsonResponse([], safe=False)
    prefix = re_escape(cleaned)

    try:
        limit = int(request.GET.get('max', ''))
    except ValueError:
        limit = 20
    else:
        if limit < 1:
            limit = 20

    # Raw string: "\m" is not a valid Python escape sequence. In PostgreSQL
    # regex syntax \m anchors at the start of a word -- presumably the
    # production database; confirm before running on another engine.
    pattern = r'\m' + prefix

    authors = Tag.objects.filter(
        category='author', name_pl__iregex=pattern).only('name', 'id', 'slug', 'category')
    data = [
        {
            'label': author.name,
            'id': author.id,
            'url': author.get_absolute_url(),
        }
        for author in authors[:limit]
    ]
    if len(data) < limit:
        # Fill the remaining slots with books.
        for b in Book.objects.filter(findable=True, title__iregex=pattern)[:limit-len(data)]:
            author_str = b.author_unicode()
            translator = b.translator()
            if translator:
                author_str += ' (tłum. ' + translator + ')'
            data.append(
                {
                    'label': b.title,
                    'author': author_str,
                    'id': b.id,
                    'url': b.get_absolute_url()
                }
            )

    if mozhint:
        data = [
            prefix,
            [
                item['label']
                for item in data
            ]
        ]

    callback = request.GET.get('callback', None)
    if callback:
        # NOTE(review): the callback name is echoed back unvalidated and the
        # response is labelled application/json; consider restricting it to
        # identifier characters and serving it as JavaScript.
        return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                            content_type="application/json; charset=utf-8")
    else:
        return JsonResponse(data, safe=False)
118
119
120
@cache.never_cache
def search(request):
    """Render the search results page driven by ``SearchFilters``.

    The filters are bound to the GET parameters. When they validate, their
    results are put in the context under ``results``, and ``hasresults`` is
    set to ``True`` if at least one result group is non-empty.
    """
    filters = SearchFilters(request.GET)
    ctx = {
        'title': 'Wynik wyszukiwania',
        # .get(): a request without the "q" parameter must not fail with a
        # KeyError before validation has even run.
        'query': filters.data.get('q', ''),
        'filters': filters,
    }
    if filters.is_valid():
        ctx['results'] = filters.results()
        if any(ctx['results'].values()):
            ctx['hasresults'] = True
    return render(request, 'search/results.html', ctx)
136
137
@cache.never_cache
def main(request):
    """Handle a search request and render the matching result page.

    When the ``search`` experiment is enabled the request is delegated to
    ``search()`` (with the ``layout`` experiment forced on). Otherwise the
    query is read from the ``q`` GET parameter, together with the optional
    ``format``, ``lang``, ``epoch``, ``kind`` and ``genre`` filters.

    Outcomes:
    * query shorter than 2 or longer than 256 characters, or blank after
      clean-up: the "too short" / "too long" page;
    * nothing found: the "no hits" page with a publishing suggestion form;
    * only a single PD author found: redirect to that author's page;
    * otherwise: the "multiple hits" page.
    """
    if request.EXPERIMENTS['search'].value:
        request.EXPERIMENTS['layout'].override(True)
        return search(request)

    raw_query = request.GET.get('q', '')

    # Named ``fmt`` so the ``format`` builtin is not shadowed.
    fmt = request.GET.get('format')
    lang = request.GET.get('lang')
    epoch = request.GET.get('epoch')
    kind = request.GET.get('kind')
    genre = request.GET.get('genre')

    if len(raw_query) < 2:
        return render(
            request, 'catalogue/search_too_short.html',
            {'prefix': raw_query})
    if len(raw_query) > 256:
        return render(
            request, 'catalogue/search_too_long.html',
            {'prefix': raw_query})

    query = prepare_query(raw_query)
    if not query.strip():
        # The query held only syntax characters (e.g. ".."); there is
        # nothing left to search for, so treat it like a too-short query.
        return render(
            request, 'catalogue/search_too_short.html',
            {'prefix': raw_query})

    # PD authors are only looked up for unfiltered searches.
    if not (fmt or lang or epoch or kind or genre):
        pd_authors = search_pd_authors(query)
    else:
        pd_authors = []

    # Books are skipped only when pictures ('obraz') were asked for.
    if fmt != 'obraz':
        books = search_books(
            query,
            lang=lang,
            only_audio=fmt == 'audio',
            only_synchro=fmt == 'synchro',
            epoch=epoch,
            kind=kind,
            genre=genre
        )
    else:
        books = []

    # Pictures are searched when no format or 'obraz' was requested, and
    # never when a language filter is set.
    if (not fmt or fmt == 'obraz') and not lang:
        pictures = search_pictures(
            query,
            epoch=epoch,
            kind=kind,
            genre=genre
        )
    else:
        pictures = []

    suggestion = ''

    if not (books or pictures or pd_authors):
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render(
            request,
            'catalogue/search_no_hits.html',
            {
                'form': form,
                'did_you_mean': suggestion
            })

    if not (books or pictures) and len(pd_authors) == 1:
        return HttpResponseRedirect(pd_authors[0].get_absolute_url())

    return render(
        request,
        'catalogue/search_multiple_hits.html',
        {
            'pd_authors': pd_authors,
            'books': books,
            'pictures': pictures,
            'did_you_mean': suggestion,
            'set': {
                'lang': lang,
                'format': fmt,
                'epoch': epoch,
                'kind': kind,
                'genre': genre,
            },
            'tags': {
                'epoch': Tag.objects.filter(category='epoch', for_books=True),
                'genre': Tag.objects.filter(category='genre', for_books=True),
                'kind': Tag.objects.filter(category='kind', for_books=True),
            },
        })
224
def search_books(query, lang=None, only_audio=False, only_synchro=False, epoch=None, kind=None, genre=None):
    """Search the index for books matching ``query`` and return the results.

    The words of the query are searched in four passes (authors, title,
    metadata, then text/themes). Each pass searches all fields accumulated
    so far but requires a hit in the fields it adds. Results for the same
    book are merged into the first one seen, results for books that have an
    ancestor among the matched books are dropped, and snippets are fetched
    for the rest.

    Finally, results are discarded when their book no longer exists or
    fails one of the optional filters: ``lang`` (book language),
    ``only_audio`` (needs an mp3 file), ``only_synchro`` (needs a daisy
    file) and the ``epoch`` / ``kind`` / ``genre`` tag slugs.
    """
    search = Search()
    words = query.split()
    fieldsets = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
    )

    cumulative_fields = []
    partial_results = []
    for required_fields, is_book in fieldsets:
        cumulative_fields.extend(required_fields)
        partial_results.append(
            search.search_words(words, cumulative_fields, required=required_fields, book=is_book))

    # Merge hits per book, keeping the order in which books first appeared.
    ordered = []
    by_book_id = {}
    for part in partial_results:
        for hit in sorted(SearchResult.aggregate(part), reverse=True):
            book_id = hit.book_id
            if book_id not in by_book_id:
                by_book_id[book_id] = hit
                ordered.append(hit)
            else:
                by_book_id[book_id].merge(hit)

    # Drop books that have an ancestor among the matched books.
    descendant_ids = set(
        Book.objects.filter(id__in=by_book_id, ancestor__in=by_book_id).values_list('id', flat=True))
    ordered = [hit for hit in ordered if hit.book_id not in descendant_ids]
    for hit in ordered:
        search.get_snippets(hit, query, num=3)

    tag_filters = (('epoch', epoch), ('kind', kind), ('genre', genre))

    def is_wanted(hit):
        # The book behind an index entry may be gone from the database.
        try:
            book = hit.book
        except Book.DoesNotExist:
            return False
        if not book:
            return False

        if lang and book.language != lang:
            return False
        if only_audio and not book.has_mp3_file():
            return False
        if only_synchro and not book.has_daisy_file():
            return False
        for category, slug in tag_filters:
            if slug and not book.tags.filter(category=category, slug=slug).exists():
                return False
        return True

    return [hit for hit in ordered if is_wanted(hit)]
279
280
def search_pictures(query, epoch=None, kind=None, genre=None):
    """Search the index for pictures matching ``query`` and return the results.

    The words of the query are searched in four passes (authors, title,
    metadata, then themes). Each pass searches all fields accumulated so
    far but requires a hit in the fields it adds. Results for the same
    picture are merged into the first one seen.

    Results are discarded when their picture no longer exists or lacks a
    tag for any of the given ``epoch`` / ``kind`` / ``genre`` slugs.
    """
    search = Search()
    words = query.split()
    fieldsets = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['themes_pl', 'themes_pl_nonstem'], False),
    )

    cumulative_fields = []
    partial_results = []
    for required_fields, is_book in fieldsets:
        cumulative_fields.extend(required_fields)
        partial_results.append(
            search.search_words(
                words, cumulative_fields, required=required_fields, book=is_book, picture=True))

    # Merge hits per picture, keeping the order of first appearance.
    ordered = []
    by_picture_id = {}
    for part in partial_results:
        for hit in sorted(PictureResult.aggregate(part), reverse=True):
            picture_id = hit.picture_id
            if picture_id not in by_picture_id:
                by_picture_id[picture_id] = hit
                ordered.append(hit)
            else:
                by_picture_id[picture_id].merge(hit)

    tag_filters = (('epoch', epoch), ('kind', kind), ('genre', genre))

    def is_wanted(hit):
        # The picture behind an index entry may be gone from the database.
        try:
            picture = hit.picture
        except Picture.DoesNotExist:
            return False
        if not picture:
            return False

        for category, slug in tag_filters:
            if slug and not picture.tags.filter(category=category, slug=slug).exists():
                return False
        return True

    return [hit for hit in ordered if is_wanted(hit)]
324
325
def search_pd_authors(query):
    """Return pdcounter authors whose name contains ``query``.

    The match is case-insensitive. Authors whose slug already belongs to
    an ``author`` tag in the catalogue are left out.
    """
    matching = Author.objects.filter(name__icontains=query)
    matching_slugs = list(matching.values_list('slug', flat=True))
    catalogued_slugs = Tag.objects.filter(
        category='author', slug__in=matching_slugs,
    ).values_list('slug', flat=True)
    return matching.exclude(slug__in=catalogued_slugs)
333
334
def prepare_query(query):
    """Normalise a raw user query before it is searched.

    Private-use Unicode characters are dropped, query-syntax characters are
    replaced with spaces, and the result is reduced to at most its first
    ten words joined by single spaces. The returned string has no leading,
    trailing or repeated whitespace; it is empty when the query contained
    nothing but removable characters.
    """
    import unicodedata
    # filter out private use characters
    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
    query = remove_query_syntax_chars(query)

    # Rebuild from the words unconditionally: the substitution above leaves
    # stray spaces behind (e.g. "Lem." -> "Lem "), which would otherwise be
    # kept for any query of ten words or fewer.
    words = query.split()
    return ' '.join(words[:10])