Add grammar to tags. Also: respect custom tag slugs on publishing.
[wolnelektury.git] / src / search / views.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from django.conf import settings
5 from django.http.response import HttpResponseRedirect
6 from django.shortcuts import render
7 from django.views.decorators import cache
8 from django.http import HttpResponse, JsonResponse
9
10 from catalogue.models import Book, Tag
11 from pdcounter.models import Author
12 from picture.models import Picture
13 from search.index import Search, SearchResult, PictureResult
14 from suggest.forms import PublishingSuggestForm
15 import re
16 import json
17
18 from wolnelektury.utils import re_escape
19
20
def match_word_re(word):
    """Build a whole-word regex for ``word`` in the default database's dialect.

    The dialect is picked from ``settings.DATABASES['default']['ENGINE']``:

    * SQLite: ``\\b`` word boundaries,
    * MySQL: ``[[:<:]]`` / ``[[:>:]]`` word-boundary markers,
    * PostgreSQL (including PostGIS): ``\\m`` / ``\\M`` word start / end.

    ``word`` is interpolated verbatim, so callers must escape any regex
    metacharacters themselves.

    Returns the pattern string, or None when the engine is not recognised.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        return r"\b%s\b" % word
    if 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % word
    if 'postgresql' in engine or 'postgis' in engine:
        # Same word-start escape that the hint view in this module relies on.
        return r"\m%s\M" % word
    # Unknown backend: keep the historical "no pattern" result.
    return None
26
27
# Matches any single one of these characters: \ / * : ( ) { } ? . [ ] +
query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")


def remove_query_syntax_chars(query, replace=' '):
    """Return ``query`` with every character matched by
    ``query_syntax_chars`` substituted by ``replace`` (a space by default).
    """
    return re.sub(query_syntax_chars, replace, query)
33
34
def did_you_mean(query, tokens):
    """Return a corrected version of ``query``.

    Currently a no-op: ``query`` is returned unchanged and ``tokens`` is
    ignored.  The spell-checking implementation below is disabled.
    """
    return query
    # NOTE: disabled implementation, unreachable and commented out.  It
    # refers to a ``dictionary`` object that is not defined in this module.
    # change = {}
    # for t in tokens:
    #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
    #     if len(authors) > 0:
    #         continue

    #     if False:
    #         if not dictionary.check(t):
    #             try:
    #                 change_to = dictionary.suggest(t)[0].lower()
    #                 if change_to != t.lower():
    #                     change[t] = change_to
    #             except IndexError:
    #                 pass

    # if change == {}:
    #     return None

    # for frm, to in change.items():
    #     query = query.replace(frm, to)

    # return query
59
60
@cache.never_cache
def hint(request, mozhint=False, param='term'):
    """Return autocomplete suggestions (authors first, then books) as JSON.

    GET parameters:

    * ``param`` (default ``term``): the prefix typed by the user.  Prefixes
      shorter than two characters, or empty once query-syntax characters are
      removed, produce an empty list.
    * ``max``: maximum number of suggestions; anything missing, non-numeric
      or below 1 falls back to 20.
    * ``callback``: when present, the payload is wrapped in a JSONP call.
      The name must be a dotted JavaScript identifier, otherwise a 400
      response is returned.

    With ``mozhint`` set, the payload is ``[prefix, [label, ...]]`` instead
    of a list of suggestion dicts.
    """
    prefix = request.GET.get(param, '')
    if len(prefix) < 2:
        return JsonResponse([], safe=False)

    prefix = re_escape(' '.join(remove_query_syntax_chars(prefix).split()))
    if not prefix:
        # Only syntax characters/whitespace were given.  A bare word-start
        # pattern would match every author and book, so return nothing.
        return JsonResponse([], safe=False)

    try:
        limit = int(request.GET.get('max', ''))
    except ValueError:
        limit = 20
    else:
        if limit < 1:
            limit = 20

    callback = request.GET.get('callback', None)
    if callback and not re.fullmatch(
            r'[A-Za-z_$][0-9A-Za-z_$]*(?:\.[A-Za-z_$][0-9A-Za-z_$]*)*', callback):
        # The callback name is echoed verbatim into the response body, so
        # refuse anything that is not a plain (dotted) identifier.
        return HttpResponse(
            'Invalid callback name.', status=400,
            content_type='text/plain; charset=utf-8')

    # Raw string: "\m" is not a valid Python escape sequence.  The pattern
    # anchors the prefix at the start of a word (PostgreSQL regex syntax).
    word_start = r'\m' + prefix

    authors = Tag.objects.filter(
        category='author', name_pl__iregex=word_start).only('name', 'id', 'slug', 'category')
    data = [
        {
            'label': author.name,
            'id': author.id,
            'url': author.get_absolute_url(),
        }
        for author in authors[:limit]
    ]
    if len(data) < limit:
        # Fill the remaining slots with matching book titles.
        data += [
            {
                'label': b.title,
                'author': b.author_unicode(),
                'id': b.id,
                'url': b.get_absolute_url()
            }
            for b in Book.objects.filter(findable=True, title__iregex=word_start)[:limit-len(data)]
        ]

    if mozhint:
        data = [
            prefix,
            [
                item['label']
                for item in data
            ]
        ]

    if callback:
        # NOTE(review): JSONP is conventionally served as
        # application/javascript; the content type is left unchanged here.
        return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                            content_type="application/json; charset=utf-8")
    else:
        return JsonResponse(data, safe=False)
113
114
@cache.never_cache
def main(request):
    """Render the search results page for the ``q`` GET parameter.

    Optional GET filters: ``format`` (``audio``, ``synchro`` or ``obraz``),
    ``lang``, ``epoch``, ``kind`` and ``genre``.

    * Queries shorter than 2 characters, before or after sanitisation, get
      the "too short" page; queries longer than 256 characters get the
      "too long" page.
    * No hits at all renders the "no hits" page with a publishing
      suggestion form pre-filled with the query.
    * A single pdcounter author as the only hit redirects to that author.
    * Anything else renders the multiple-hits page.
    """
    raw_query = request.GET.get('q', '')

    fmt = request.GET.get('format')
    lang = request.GET.get('lang')
    epoch = request.GET.get('epoch')
    kind = request.GET.get('kind')
    genre = request.GET.get('genre')

    if len(raw_query) < 2:
        return render(
            request, 'catalogue/search_too_short.html',
            {'prefix': raw_query})
    if len(raw_query) > 256:
        return render(
            request, 'catalogue/search_too_long.html',
            {'prefix': raw_query})

    query = prepare_query(raw_query)
    if len(query.strip()) < 2:
        # Sanitisation can reduce a query such as "((" to blanks; treat it
        # like any other too-short query instead of searching for nothing.
        return render(
            request, 'catalogue/search_too_short.html',
            {'prefix': raw_query})

    # pdcounter authors are only searched when no filter is active.
    if not (fmt or lang or epoch or kind or genre):
        pd_authors = search_pd_authors(query)
    else:
        pd_authors = []

    # Books are searched unless the format filter asks for pictures only.
    if fmt != 'obraz':
        books = search_books(
            query,
            lang=lang,
            only_audio=fmt == 'audio',
            only_synchro=fmt == 'synchro',
            epoch=epoch,
            kind=kind,
            genre=genre
        )
    else:
        books = []

    # Pictures are searched only without a language filter, and only when
    # the format filter is absent or asks for pictures.
    if (not fmt or fmt == 'obraz') and not lang:
        pictures = search_pictures(
            query,
            epoch=epoch,
            kind=kind,
            genre=genre
        )
    else:
        pictures = []

    # No suggestion is computed here; templates receive an empty string.
    suggestion = ''

    if not (books or pictures or pd_authors):
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render(
            request,
            'catalogue/search_no_hits.html',
            {
                'form': form,
                'did_you_mean': suggestion
            })

    if not (books or pictures) and len(pd_authors) == 1:
        return HttpResponseRedirect(pd_authors[0].get_absolute_url())

    return render(
        request,
        'catalogue/search_multiple_hits.html',
        {
            'pd_authors': pd_authors,
            'books': books,
            'pictures': pictures,
            'did_you_mean': suggestion,
            'set': {
                'lang': lang,
                'format': fmt,
                'epoch': epoch,
                'kind': kind,
                'genre': genre,
            },
            'tags': {
                'epoch': Tag.objects.filter(category='epoch', for_books=True),
                'genre': Tag.objects.filter(category='genre', for_books=True),
                'kind': Tag.objects.filter(category='kind', for_books=True),
            },
        })
197
def search_books(query, lang=None, only_audio=False, only_synchro=False, epoch=None, kind=None, genre=None):
    """Search the index for books matching ``query`` and return the hits.

    The words of ``query`` are searched over a growing list of fields
    (authors, then title, then metadata, then text/themes).  Hits are merged
    per book.  Books that have an ancestor among the hits are dropped, and
    snippets are requested for the remaining results.  The results are then
    filtered by ``lang``, ``only_audio`` (needs an MP3 file), ``only_synchro``
    (needs a DAISY file) and the ``epoch`` / ``kind`` / ``genre`` tag slugs.
    Results whose book no longer exists are dropped as well.
    """
    search = Search()
    words = query.split()
    fieldsets = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
    )

    # Each pass searches all fields seen so far, but requires a match in
    # the fields newly added by that pass.
    search_fields = []
    results_parts = []
    for fields, is_book in fieldsets:
        search_fields.extend(fields)
        results_parts.append(
            search.search_words(words, search_fields, required=fields, book=is_book))

    # Keep one result per book, in first-seen order, merging later hits in.
    ids_results = {}
    results = []
    for part in results_parts:
        for hit in sorted(SearchResult.aggregate(part), reverse=True):
            book_id = hit.book_id
            if book_id not in ids_results:
                results.append(hit)
                ids_results[book_id] = hit
            else:
                ids_results[book_id].merge(hit)

    # Drop books that have an ancestor among the hits.
    descendant_ids = set(
        Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
    results = [hit for hit in results if hit.book_id not in descendant_ids]

    for hit in results:
        search.get_snippets(hit, query, num=3)

    tag_filters = [
        (category, slug)
        for category, slug in (('epoch', epoch), ('kind', kind), ('genre', genre))
        if slug
    ]

    def passes_filters(r):
        # The index may reference a book that is gone from the database.
        try:
            book = r.book
        except Book.DoesNotExist:
            return False
        if not book:
            return False

        if lang and book.language != lang:
            return False
        if only_audio and not book.has_mp3_file():
            return False
        if only_synchro and not book.has_daisy_file():
            return False
        return all(
            book.tags.filter(category=category, slug=slug).exists()
            for category, slug in tag_filters
        )

    return [hit for hit in results if passes_filters(hit)]
252
253
def search_pictures(query, epoch=None, kind=None, genre=None):
    """Search the index for pictures matching ``query`` and return the hits.

    The words of ``query`` are searched over a growing list of fields
    (authors, then title, then metadata, then themes) with ``picture=True``.
    Hits are merged per picture.  Results whose picture no longer exists, or
    which lack a requested ``epoch`` / ``kind`` / ``genre`` tag slug, are
    dropped.
    """
    search = Search()
    words = query.split()
    fieldsets = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['themes_pl', 'themes_pl_nonstem'], False),
    )

    # Each pass searches all fields seen so far, but requires a match in
    # the fields newly added by that pass.
    search_fields = []
    results_parts = []
    for fields, is_book in fieldsets:
        search_fields.extend(fields)
        results_parts.append(
            search.search_words(words, search_fields, required=fields, book=is_book, picture=True))

    # Keep one result per picture, in first-seen order, merging later hits in.
    ids_results = {}
    results = []
    for part in results_parts:
        for hit in sorted(PictureResult.aggregate(part), reverse=True):
            picture_id = hit.picture_id
            if picture_id not in ids_results:
                results.append(hit)
                ids_results[picture_id] = hit
            else:
                ids_results[picture_id].merge(hit)

    tag_filters = [
        (category, slug)
        for category, slug in (('epoch', epoch), ('kind', kind), ('genre', genre))
        if slug
    ]

    def passes_filters(r):
        # The index may reference a picture that is gone from the database.
        try:
            picture = r.picture
        except Picture.DoesNotExist:
            return False
        if not picture:
            return False

        return all(
            picture.tags.filter(category=category, slug=slug).exists()
            for category, slug in tag_filters
        )

    return [hit for hit in results if passes_filters(hit)]
297
298
def search_pd_authors(query):
    """Return pdcounter authors whose name contains ``query``
    (case-insensitively), leaving out those whose slug is already used by an
    ``author`` tag in the catalogue.
    """
    matching = Author.objects.filter(name__icontains=query)
    candidate_slugs = list(matching.values_list('slug', flat=True))
    existing_slugs = Tag.objects.filter(
        category='author', slug__in=candidate_slugs,
    ).values_list('slug', flat=True)
    return matching.exclude(slug__in=existing_slugs)
306
307
def prepare_query(query):
    """Sanitise a raw user query before it is searched.

    Steps, in order: collapse whitespace runs into single spaces, drop
    Unicode private-use characters (category ``Co``), replace query-syntax
    characters via ``remove_query_syntax_chars``, and cap the query at its
    first 10 words.
    """
    import unicodedata

    collapsed = ' '.join(query.split())
    # filter out private use characters
    without_private = ''.join(
        char for char in collapsed if unicodedata.category(char) != 'Co')
    cleaned = remove_query_syntax_chars(without_private)

    words = cleaned.split()
    if len(words) <= 10:
        return cleaned
    return ' '.join(words[:10])