Upgrade django-pipeline to fix conflict with sentry-sdk.
[wolnelektury.git] / src / search / views.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from django.conf import settings
5 from django.http.response import HttpResponseRedirect
6 from django.shortcuts import render
7 from django.views.decorators import cache
8 from django.http import HttpResponse, JsonResponse
9
10 from catalogue.models import Book, Tag
11 from pdcounter.models import Author
12 from picture.models import Picture
13 from search.index import Search, SearchResult, PictureResult
14 from suggest.forms import PublishingSuggestForm
15 import re
16 import json
17
18 from wolnelektury.utils import re_escape
19
20
def match_word_re(word):
    """Build a whole-word regex for ``word`` in the default database's dialect.

    The pattern is meant for Django ``__iregex`` lookups, which hand the
    expression to the database engine, so the word-boundary syntax depends
    on the configured backend.

    Args:
        word: the (already escaped) word to wrap in word-boundary markers.

    Returns:
        The pattern string, or ``None`` when the configured engine is not
        one of the recognised backends.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        return r"\b%s\b" % word
    if 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % word
    if 'postgresql' in engine or 'postgis' in engine:
        # PostgreSQL spells "start of word" / "end of word" as \m and \M.
        return r"\m%s\M" % word
    # Unknown backend: keep the historical "no pattern" result, but say so
    # explicitly instead of falling off the end of the function.
    return None
26
27
# Single characters removed from user queries before searching:
# backslash, slash, asterisk, colon, parentheses, braces, question mark,
# dot, square brackets and plus.
query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")


def remove_query_syntax_chars(query, replace=' '):
    """Return ``query`` with every syntax character swapped for ``replace``.

    Args:
        query: the raw query text.
        replace: substitution for each matched character (a single space
            by default).
    """
    return re.sub(query_syntax_chars, replace, query)
33
34
def did_you_mean(query, tokens):
    """Return a suggested replacement for ``query``.

    Spelling suggestions are currently disabled: the query is handed back
    unchanged and ``tokens`` is ignored.

    Args:
        query: the search query text.
        tokens: the individual words of the query (unused for now).

    Returns:
        ``query``, unmodified.
    """
    # NOTE: an earlier, disabled implementation skipped tokens matching an
    # author tag and replaced the remaining ones with the first suggestion
    # from a spell-check dictionary. It depended on a ``dictionary`` object
    # that this module does not define; see version control if it needs to
    # be revived.
    return query
59
60
@cache.never_cache
def hint(request):
    """Return autocomplete hints (authors first, then books) as JSON.

    GET parameters read:
        term: the text typed so far; fewer than two usable characters
            yields an empty list.
        max: maximum number of hints; anything missing, non-numeric or
            below 1 falls back to 20.
        callback: optional; when given, the JSON payload is wrapped in a
            call to that name (JSONP style).

    Returns:
        A ``JsonResponse`` holding a list of dicts with ``label``, ``id``
        and ``url`` keys (book entries also carry ``author``), or an
        ``HttpResponse`` with the wrapped payload when ``callback`` is set.
    """
    # Strip query-syntax characters and collapse whitespace *before* the
    # length check: a term made only of such characters would otherwise
    # produce an empty prefix and a pattern that matches almost every row.
    term = ' '.join(remove_query_syntax_chars(request.GET.get('term', '')).split())
    if len(term) < 2:
        return JsonResponse([], safe=False)

    # Raw string: "\m" is not a Python escape sequence. The anchor itself is
    # the PostgreSQL "start of word" marker, so this lookup assumes a
    # PostgreSQL database.
    pattern = r'\m' + re_escape(term)

    try:
        limit = int(request.GET.get('max', ''))
    except ValueError:
        limit = 20
    else:
        if limit < 1:
            limit = 20

    authors = Tag.objects.filter(
        category='author', name_pl__iregex=pattern).only('name', 'id', 'slug', 'category')
    data = [
        {
            'label': author.name,
            'id': author.id,
            'url': author.get_absolute_url(),
        }
        for author in authors[:limit]
    ]

    # Fill whatever room is left with matching book titles.
    remaining = limit - len(data)
    if remaining > 0:
        data += [
            {
                'label': b.title,
                'author': b.author_unicode(),
                'id': b.id,
                'url': b.get_absolute_url()
            }
            for b in Book.objects.filter(title__iregex=pattern)[:remaining]
        ]

    callback = request.GET.get('callback', None)
    if callback:
        # NOTE(review): ``callback`` is echoed into the response body without
        # validation; consider restricting it to identifier characters.
        return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                            content_type="application/json; charset=utf-8")
    return JsonResponse(data, safe=False)
103
104
@cache.never_cache
def main(request):
    """Render the search results page for the ``q`` GET parameter.

    Outcomes, in order of checking:
      * query shorter than 2 characters -> "too short" page;
      * query longer than 256 characters -> "too long" page;
      * no books, pictures or public-domain authors found -> "no hits"
        page with a publishing-suggestion form pre-filled with the query;
      * only a single public-domain author found -> redirect to that
        author's page;
      * otherwise -> the multiple-hits page.
    """
    query = request.GET.get('q', '')
    if len(query) < 2:
        return render(
            request, 'catalogue/search_too_short.html',
            {'prefix': query})
    if len(query) > 256:
        return render(
            request, 'catalogue/search_too_long.html',
            {'prefix': query})

    query = prepare_query(query)
    pd_authors = search_pd_authors(query)
    # Materialise the results: a lazy iterator is always truthy, which would
    # make the emptiness checks below meaningless, and it can be consumed
    # only once by the template.
    books = list(search_books(query))
    pictures = list(search_pictures(query))
    suggestion = ''

    if not (books or pictures or pd_authors):
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render(
            request,
            'catalogue/search_no_hits.html',
            {
                'form': form,
                'did_you_mean': suggestion
            })

    if not (books or pictures) and len(pd_authors) == 1:
        return HttpResponseRedirect(pd_authors[0].get_absolute_url())

    return render(
        request,
        'catalogue/search_multiple_hits.html',
        {
            'pd_authors': pd_authors,
            'books': books,
            'pictures': pictures,
            'did_you_mean': suggestion
        })
145
def search_books(query):
    """Search the index for books matching ``query``.

    The query words are searched in several passes. Each pass adds one
    group of fields to the accumulated field list and passes the newly
    added group as ``required``. Hits for the same book from different
    passes are merged into a single result, keeping the position of the
    first hit.

    Args:
        query: the prepared query string.

    Returns:
        A list of results, excluding books that have an ancestor among the
        hits and results whose book no longer exists in the database.
    """
    search = Search()
    words = query.split()
    fieldsets = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
    )
    search_fields = []
    results_parts = []
    for fields, is_book in fieldsets:
        search_fields += fields
        results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))

    results = []
    ids_results = {}
    for results_part in results_parts:
        for result in sorted(SearchResult.aggregate(results_part), reverse=True):
            book_id = result.book_id
            if book_id in ids_results:
                ids_results[book_id].merge(result)
            else:
                results.append(result)
                ids_results[book_id] = result

    # Drop hits for books whose ancestor is also among the hits.
    descendant_ids = set(
        Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
    results = [result for result in results if result.book_id not in descendant_ids]
    for result in results:
        search.get_snippets(result, query, num=3)

    def ensure_exists(r):
        try:
            return r.book
        except Book.DoesNotExist:
            return False

    # Return a real list rather than a lazy ``filter`` object, so callers can
    # test it for emptiness and iterate over it more than once.
    return [result for result in results if ensure_exists(result)]
184
185
def search_pictures(query):
    """Search the index for pictures matching ``query``.

    The query words are searched in several passes. Each pass adds one
    group of fields to the accumulated field list and passes the newly
    added group as ``required``. Hits for the same picture from different
    passes are merged into a single result, keeping the position of the
    first hit.

    Args:
        query: the prepared query string.

    Returns:
        A list of results, excluding those whose picture no longer exists
        in the database.
    """
    search = Search()
    words = query.split()
    fieldsets = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['themes_pl', 'themes_pl_nonstem'], False),
    )
    search_fields = []
    results_parts = []
    for fields, is_book in fieldsets:
        search_fields += fields
        results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))

    results = []
    ids_results = {}
    for results_part in results_parts:
        for result in sorted(PictureResult.aggregate(results_part), reverse=True):
            picture_id = result.picture_id
            if picture_id in ids_results:
                ids_results[picture_id].merge(result)
            else:
                results.append(result)
                ids_results[picture_id] = result

    def ensure_exists(r):
        try:
            return r.picture
        except Picture.DoesNotExist:
            return False

    # Return a real list rather than a lazy ``filter`` object, so callers can
    # test it for emptiness and iterate over it more than once.
    return [result for result in results if ensure_exists(result)]
219
220
def search_pd_authors(query):
    """Find ``Author`` records whose name contains ``query``.

    Authors whose slug is already used by a ``Tag`` of category
    ``'author'`` are left out of the result.

    Returns:
        A queryset of the remaining ``Author`` objects.
    """
    matching = Author.objects.filter(name__icontains=query)
    candidate_slugs = list(matching.values_list('slug', flat=True))
    tagged_slugs = Tag.objects.filter(
        category='author', slug__in=candidate_slugs,
    ).values_list('slug', flat=True)
    return matching.exclude(slug__in=tagged_slugs)
228
229
def prepare_query(query):
    """Normalise a raw user query before it is passed to the search helpers.

    Steps:
      1. drop Unicode private-use characters (category ``Co``);
      2. replace query-syntax characters with spaces;
      3. collapse all whitespace runs to single spaces, trimming both ends;
      4. keep at most the first 10 words.

    Whitespace is normalised *after* the syntax characters are replaced,
    so the result never carries doubled or trailing spaces left by the
    replacement; those would defeat substring lookups on the query.

    Args:
        query: the raw query text.

    Returns:
        The cleaned query: at most 10 words joined by single spaces.
    """
    import unicodedata

    # filter out private use characters
    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
    words = remove_query_syntax_chars(query).split()
    return ' '.join(words[:10])