5acbffa8591704c255f1dc3905fc2f66e3491a3c
[wolnelektury.git] / src / search / views.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from django.conf import settings
5 from django.http.response import HttpResponseRedirect
6 from django.shortcuts import render
7 from django.views.decorators import cache
8 from django.http import HttpResponse, JsonResponse
9
10 from catalogue.models import Book, Tag
11 from pdcounter.models import Author
12 from picture.models import Picture
13 from search.index import Search, SearchResult, PictureResult
14 from suggest.forms import PublishingSuggestForm
15 import re
16 import json
17
18 from wolnelektury.utils import re_escape
19
20
def match_word_re(word):
    """Wrap *word* in word-boundary markers for the default database engine.

    The engine name is read from ``settings.DATABASES['default']['ENGINE']``.
    ``word`` is interpolated as-is (it is not escaped here).

    Returns the pattern string for SQLite and MySQL engines, and ``None``
    for any other engine.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        return r"\b%s\b" % word
    if 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % word
    # No pattern flavour is defined for other engines.
    return None
26
27
# Characters treated as query syntax and stripped from user input:
# backslash, slash, and * : ( ) { } ? . [ ] +
query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")


def remove_query_syntax_chars(query, replace=' '):
    """Return *query* with every query-syntax character substituted.

    Each character matched by ``query_syntax_chars`` is replaced with
    *replace* (a single space by default). Whitespace is not collapsed.
    """
    return re.sub(query_syntax_chars, replace, query)
33
34
def did_you_mean(query, tokens):
    """Return a corrected version of *query*; currently a no-op.

    The query is handed back unchanged and *tokens* is ignored.
    """
    # NOTE: a previous implementation is disabled. It skipped tokens that
    # matched an author tag (via match_word_re), asked a spell-checking
    # ``dictionary`` for a replacement of every other token, substituted
    # the replacements into the query and returned None when nothing changed.
    return query
59
60
@cache.never_cache
def hint(request, mozhint=False, param='term'):
    """Return autocomplete hints for the search box as JSON (or JSONP).

    The search term is read from the GET parameter named by *param*.
    Author tags whose ``name_pl`` has a word starting with the term come
    first, followed by findable books whose title does, up to the limit
    given in the ``max`` GET parameter (20 when missing, invalid or < 1).

    Each hint is a dict with ``label``, ``id`` and ``url``; book hints
    also carry ``author``. With *mozhint* set, the payload is instead
    ``[prefix, [label, ...]]``.

    If a ``callback`` GET parameter is present the payload is wrapped as
    ``callback(payload);``. A callback that is not a plain (optionally
    dotted/indexed) identifier gets a 400 response.
    """
    prefix = request.GET.get(param, '')
    if len(prefix) < 2:
        return JsonResponse([], safe=False)

    prefix = ' '.join(remove_query_syntax_chars(prefix).split())
    if not prefix:
        # The term consisted only of query-syntax characters. An empty
        # pattern would match every author and book, so return no hints.
        return JsonResponse([], safe=False)
    prefix = re_escape(prefix)

    try:
        limit = int(request.GET.get('max', ''))
    except ValueError:
        limit = 20
    else:
        if limit < 1:
            limit = 20

    # Raw string: "\m" is not a valid Python escape sequence. It is the
    # PostgreSQL start-of-word regex anchor, so this lookup presumably
    # requires a PostgreSQL backend.
    word_start = r'\m' + prefix

    authors = Tag.objects.filter(
        category='author', name_pl__iregex=word_start).only('name', 'id', 'slug', 'category')
    data = [
        {
            'label': author.name,
            'id': author.id,
            'url': author.get_absolute_url(),
        }
        for author in authors[:limit]
    ]
    if len(data) < limit:
        # Fill the remaining slots with matching books.
        for b in Book.objects.filter(findable=True, title__iregex=word_start)[:limit - len(data)]:
            author_str = b.author_unicode()
            translator = b.translator()
            if translator:
                author_str += ' (tłum. ' + translator + ')'
            data.append(
                {
                    'label': b.title,
                    'author': author_str,
                    'id': b.id,
                    'url': b.get_absolute_url()
                }
            )

    if mozhint:
        data = [
            prefix,
            [
                item['label']
                for item in data
            ]
        ]

    callback = request.GET.get('callback', None)
    if callback:
        # The callback name is user-controlled and is echoed verbatim into
        # the response body, so only identifier-like names are accepted.
        if not re.fullmatch(r'[A-Za-z_$][A-Za-z0-9_$.\[\]]*', callback):
            return HttpResponse(status=400)
        return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                            content_type="application/json; charset=utf-8")
    else:
        return JsonResponse(data, safe=False)
117
118
@cache.never_cache
def main(request):
    """Render the search results page for the ``q`` GET parameter.

    Optional GET filters: ``format``, ``lang``, ``epoch``, ``kind`` and
    ``genre``. Queries shorter than 2 or longer than 256 characters get a
    dedicated page. With no hits at all, a page with a publishing
    suggestion form is rendered. When the only hit is a single pdcounter
    author, the response redirects to that author's page.
    """
    params = request.GET
    query = params.get('q', '')

    fmt = params.get('format')
    lang = params.get('lang')
    epoch = params.get('epoch')
    kind = params.get('kind')
    genre = params.get('genre')

    if len(query) < 2:
        return render(request, 'catalogue/search_too_short.html', {'prefix': query})
    if len(query) > 256:
        return render(request, 'catalogue/search_too_long.html', {'prefix': query})

    query = prepare_query(query)

    # pdcounter authors are only looked up when no filter is active.
    if fmt or lang or epoch or kind or genre:
        pd_authors = []
    else:
        pd_authors = search_pd_authors(query)

    # Books are searched for every format except 'obraz'.
    if fmt != 'obraz':
        books = search_books(
            query,
            lang=lang,
            only_audio=(fmt == 'audio'),
            only_synchro=(fmt == 'synchro'),
            epoch=epoch,
            kind=kind,
            genre=genre,
        )
    else:
        books = []

    # Pictures are searched when no format or 'obraz' is requested, and
    # only when no language filter is set.
    wants_pictures = (not fmt or fmt == 'obraz') and not lang
    if wants_pictures:
        pictures = search_pictures(query, epoch=epoch, kind=kind, genre=genre)
    else:
        pictures = []

    suggestion = ''

    if not books and not pictures and not pd_authors:
        context = {
            'form': PublishingSuggestForm(initial={"books": query + ", "}),
            'did_you_mean': suggestion,
        }
        return render(request, 'catalogue/search_no_hits.html', context)

    if not (books or pictures) and len(pd_authors) == 1:
        return HttpResponseRedirect(pd_authors[0].get_absolute_url())

    context = {
        'pd_authors': pd_authors,
        'books': books,
        'pictures': pictures,
        'did_you_mean': suggestion,
        # Filters currently selected by the user.
        'set': {
            'lang': lang,
            'format': fmt,
            'epoch': epoch,
            'kind': kind,
            'genre': genre,
        },
        # Tags offered as filter choices.
        'tags': {
            category: Tag.objects.filter(category=category, for_books=True)
            for category in ('epoch', 'genre', 'kind')
        },
    }
    return render(request, 'catalogue/search_multiple_hits.html', context)
201
def search_books(query, lang=None, only_audio=False, only_synchro=False, epoch=None, kind=None, genre=None):
    """Search the index for books matching *query* and apply the filters.

    The search runs in several passes; each pass requires a match in one
    more group of fields while still searching all earlier groups. Results
    are merged per book in the order they are first seen. Books having an
    ancestor that is also among the results are dropped, snippets are
    fetched for the rest, and finally results whose book is missing or
    does not satisfy *lang*, *only_audio*, *only_synchro*, *epoch*, *kind*
    or *genre* are filtered out.

    Returns a list of merged search results.
    """
    engine = Search()
    terms = query.split()
    # (fields required in this pass, value passed as ``book=``)
    passes = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
    )
    # The same list object is extended in place and passed to every pass.
    searched_fields = []
    partial_hits = []
    for required_fields, book_flag in passes:
        searched_fields.extend(required_fields)
        partial_hits.append(
            engine.search_words(terms, searched_fields, required=required_fields, book=book_flag))

    ordered = []
    by_book = {}
    for part in partial_hits:
        for hit in sorted(SearchResult.aggregate(part), reverse=True):
            first_seen = by_book.setdefault(hit.book_id, hit)
            if first_seen is hit:
                # First result for this book: it keeps its position.
                ordered.append(hit)
            else:
                first_seen.merge(hit)

    descendant_ids = set(
        Book.objects.filter(id__in=by_book, ancestor__in=by_book).values_list('id', flat=True))
    ordered = [hit for hit in ordered if hit.book_id not in descendant_ids]
    for hit in ordered:
        engine.get_snippets(hit, query, num=3)

    tag_filters = (('epoch', epoch), ('kind', kind), ('genre', genre))

    def passes_filters(hit):
        # The indexed book may no longer exist in the database.
        try:
            if not hit.book:
                return False
        except Book.DoesNotExist:
            return False

        if lang and hit.book.language != lang:
            return False
        if only_audio and not hit.book.has_mp3_file():
            return False
        if only_synchro and not hit.book.has_daisy_file():
            return False
        for category, slug in tag_filters:
            if slug and not hit.book.tags.filter(category=category, slug=slug).exists():
                return False
        return True

    return [hit for hit in ordered if passes_filters(hit)]
256
257
def search_pictures(query, epoch=None, kind=None, genre=None):
    """Search the index for pictures matching *query* and apply the filters.

    The search runs in several passes; each pass requires a match in one
    more group of fields while still searching all earlier groups. Results
    are merged per picture in the order they are first seen. Results whose
    picture is missing or lacks the requested *epoch*, *kind* or *genre*
    tag are filtered out.

    Returns a list of merged picture results.
    """
    engine = Search()
    terms = query.split()
    # (fields required in this pass, value passed as ``book=``)
    passes = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['themes_pl', 'themes_pl_nonstem'], False),
    )
    # The same list object is extended in place and passed to every pass.
    searched_fields = []
    partial_hits = []
    for required_fields, book_flag in passes:
        searched_fields.extend(required_fields)
        partial_hits.append(
            engine.search_words(
                terms, searched_fields, required=required_fields, book=book_flag, picture=True))

    ordered = []
    by_picture = {}
    for part in partial_hits:
        for hit in sorted(PictureResult.aggregate(part), reverse=True):
            first_seen = by_picture.setdefault(hit.picture_id, hit)
            if first_seen is hit:
                # First result for this picture: it keeps its position.
                ordered.append(hit)
            else:
                first_seen.merge(hit)

    tag_filters = (('epoch', epoch), ('kind', kind), ('genre', genre))

    def passes_filters(hit):
        # The indexed picture may no longer exist in the database.
        try:
            if not hit.picture:
                return False
        except Picture.DoesNotExist:
            return False

        for category, slug in tag_filters:
            if slug and not hit.picture.tags.filter(category=category, slug=slug).exists():
                return False
        return True

    return [hit for hit in ordered if passes_filters(hit)]
301
302
def search_pd_authors(query):
    """Return pdcounter authors whose name contains *query*.

    The match is case-insensitive. Authors whose slug is already used by
    an ``author`` tag in the catalogue are excluded.

    Returns a queryset of ``Author`` objects.
    """
    candidates = Author.objects.filter(name__icontains=query)
    candidate_slugs = list(candidates.values_list('slug', flat=True))
    # Slugs that already have an author tag in the catalogue.
    taken_slugs = Tag.objects.filter(
        category='author', slug__in=candidate_slugs,
    ).values_list('slug', flat=True)
    return candidates.exclude(slug__in=taken_slugs)
310
311
def prepare_query(query):
    """Normalize a raw user query before it is passed to the search.

    Private-use Unicode characters are dropped, query-syntax characters
    are replaced with spaces, runs of whitespace are collapsed to single
    spaces, and the query is cut to its first 10 words.

    Returns the cleaned query string, without leading or trailing
    whitespace.
    """
    import unicodedata

    # filter out private use characters
    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
    query = remove_query_syntax_chars(query)

    # Collapse whitespace only after syntax characters have become spaces.
    # Otherwise "Mickiewicz." turns into "Mickiewicz " and the trailing
    # space breaks substring lookups done with the prepared query.
    words = query.split()
    return ' '.join(words[:10])
321         query = ' '.join(words[:10])
322     return query