Remove a word.
[wolnelektury.git] / src / search / views.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from django.conf import settings
5 from django.http.response import HttpResponseRedirect
6 from django.shortcuts import render
7 from django.views.decorators import cache
8 from django.http import HttpResponse, JsonResponse
9
10 from catalogue.models import Book, Tag
11 from pdcounter.models import Author
12 from picture.models import Picture
13 from search.index import Search, SearchResult, PictureResult
14 from suggest.forms import PublishingSuggestForm
15 import re
16 import json
17
18 from wolnelektury.utils import re_escape
19
20
def match_word_re(word):
    """Build a whole-word regular expression for ``word``.

    The word-boundary syntax is chosen from the engine configured in
    ``settings.DATABASES['default']['ENGINE']``. ``word`` is interpolated
    verbatim, so escape it first if it may contain regex metacharacters.

    Returns the pattern string, or None when the engine is not recognised.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        return r"\b%s\b" % word
    elif 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % word
    elif 'postgresql' in engine or 'postgis' in engine:
        # PostgreSQL regexes spell the word boundaries as \m (start of word)
        # and \M (end of word).
        return r"\m%s\M" % word
    # Unknown engine: keep the historical "no pattern" result, but explicitly.
    return None
26
27
# Matches any single character from the set  \ / * : ( ) { } ? . [ ] +
# i.e. the characters that remove_query_syntax_chars() strips from user queries.
query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
29
30
def remove_query_syntax_chars(query, replace=' '):
    """Return ``query`` with every query-syntax character replaced.

    Each character matched by ``query_syntax_chars`` is substituted with
    ``replace`` (a single space by default); all other text is left as is.
    """
    cleaned = re.sub(query_syntax_chars, replace, query)
    return cleaned
33
34
def did_you_mean(query, tokens):
    """Return a spelling suggestion for ``query``; currently a no-op.

    The suggestion logic is commented out below, so ``query`` is returned
    unchanged and ``tokens`` is ignored.
    """
    return query
    # Disabled implementation, kept for reference. It skipped tokens matching
    # an author tag and replaced the remaining ones with dictionary
    # suggestions (that part was itself switched off with ``if False``).
    # change = {}
    # for t in tokens:
    #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
    #     if len(authors) > 0:
    #         continue

    #     if False:
    #         if not dictionary.check(t):
    #             try:
    #                 change_to = dictionary.suggest(t)[0].lower()
    #                 if change_to != t.lower():
    #                     change[t] = change_to
    #             except IndexError:
    #                 pass

    # if change == {}:
    #     return None

    # for frm, to in change.items():
    #     query = query.replace(frm, to)

    # return query
59
60
@cache.never_cache
def hint(request, mozhint=False, param='term'):
    """Return autocomplete suggestions for a typed prefix.

    Author tags are listed first; if fewer than the limit were found, the
    remaining slots are filled with findable books.

    GET parameters read:
        <param>   -- the text typed so far (parameter name given by ``param``).
        max       -- maximum number of suggestions; falls back to 20 when it
                     is missing, not an integer, or lower than 1.
        callback  -- optional JSONP callback name.

    Args:
        request: the current HTTP request.
        mozhint: when true, the payload is ``[prefix, [label, ...]]`` instead
            of a list of suggestion dicts.
        param: name of the GET parameter holding the typed text.

    Returns:
        A ``JsonResponse``, or an ``HttpResponse`` wrapping the JSON in the
        callback call when a valid ``callback`` was supplied.
    """
    # Strip query-syntax characters and collapse whitespace *before* the
    # length check, so input such as "((" cannot shrink to an empty pattern
    # that would match nearly every author and book.
    prefix = ' '.join(remove_query_syntax_chars(request.GET.get(param, '')).split())
    if len(prefix) < 2:
        return JsonResponse([], safe=False)

    prefix = re_escape(prefix)
    # \m anchors the match at the start of a word (PostgreSQL regex syntax).
    # A raw string keeps the backslash without an invalid-escape warning.
    word_start = r'\m' + prefix

    try:
        limit = int(request.GET.get('max', ''))
    except ValueError:
        limit = 20
    else:
        if limit < 1:
            limit = 20

    authors = Tag.objects.filter(
        category='author', name_pl__iregex=word_start).only('name', 'id', 'slug', 'category')
    data = [
        {
            'label': author.name,
            'id': author.id,
            'url': author.get_absolute_url(),
        }
        for author in authors[:limit]
    ]
    if len(data) < limit:
        books = Book.objects.filter(findable=True, title__iregex=word_start)
        data += [
            {
                'label': b.title,
                'author': b.author_unicode(),
                'id': b.id,
                'url': b.get_absolute_url()
            }
            for b in books[:limit - len(data)]
        ]

    if mozhint:
        data = [
            prefix,
            [
                item['label']
                for item in data
            ]
        ]

    callback = request.GET.get('callback', None)
    # The callback name is echoed into the response body, so only accept
    # something shaped like a (dotted) JavaScript identifier; anything else
    # is ignored and plain JSON is returned.
    if callback and re.fullmatch(r'[A-Za-z_$][\w$.]*', callback, re.ASCII):
        # NOTE(review): JSONP is served as application/json here; browsers
        # enforcing "nosniff" may refuse to run it as a script -- confirm.
        return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                            content_type="application/json; charset=utf-8")
    else:
        return JsonResponse(data, safe=False)
113
114
@cache.never_cache
def main(request):
    """Render the search results page for the ``q`` GET parameter.

    Outcomes:
        * query shorter than 2 characters (raw, or after normalisation)
          -> 'catalogue/search_too_short.html';
        * raw query longer than 256 characters
          -> 'catalogue/search_too_long.html';
        * nothing found -> 'catalogue/search_no_hits.html' with a
          ``PublishingSuggestForm`` pre-filled with the query;
        * the only hit is a single pdcounter author -> redirect to that
          author's page;
        * otherwise -> 'catalogue/search_multiple_hits.html'.
    """
    raw_query = request.GET.get('q', '')
    if len(raw_query) < 2:
        return render(
            request, 'catalogue/search_too_short.html',
            {'prefix': raw_query})
    elif len(raw_query) > 256:
        return render(
            request, 'catalogue/search_too_long.html',
            {'prefix': raw_query})

    query = prepare_query(raw_query)
    # Normalisation can leave nothing to search for (e.g. a query made only
    # of whitespace or syntax characters). An empty needle would make the
    # ``icontains`` author lookup match every row, so treat it as too short.
    if len(query.strip()) < 2:
        return render(
            request, 'catalogue/search_too_short.html',
            {'prefix': raw_query})

    pd_authors = search_pd_authors(query)
    books = search_books(query)
    pictures = search_pictures(query)
    # No spelling suggestion is produced at the moment.
    suggestion = ''

    if not (books or pictures or pd_authors):
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render(
            request,
            'catalogue/search_no_hits.html',
            {
                'form': form,
                'did_you_mean': suggestion
            })

    if not (books or pictures) and len(pd_authors) == 1:
        return HttpResponseRedirect(pd_authors[0].get_absolute_url())

    return render(
        request,
        'catalogue/search_multiple_hits.html',
        {
            'pd_authors': pd_authors,
            'books': books,
            'pictures': pictures,
            'did_you_mean': suggestion
        })
155
def search_books(query):
    """Search the index for books matching ``query``.

    The search runs in four stages (authors, title, metadata, then text and
    themes). Hits for the same book are merged into the first result seen
    for it, books that have another matched book among their ancestors are
    dropped, and up to three snippets are requested for each remaining
    result. Results whose book no longer exists are filtered out.

    Returns a list of ``SearchResult`` objects.
    """
    index = Search()
    terms = query.split()
    # (fields a stage requires a match in, value passed as ``book=``)
    stages = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
    )

    searchable = []
    stage_hits = []
    for required_fields, book_flag in stages:
        # The searchable field list accumulates across stages (one shared
        # list, extended in place); only the stage's own fields are required.
        searchable.extend(required_fields)
        stage_hits.append(
            index.search_words(terms, searchable, required=required_fields, book=book_flag))

    ordered = []
    by_book = {}
    for hits in stage_hits:
        for hit in sorted(SearchResult.aggregate(hits), reverse=True):
            key = hit.book_id
            if key in by_book:
                by_book[key].merge(hit)
            else:
                ordered.append(hit)
                by_book[key] = hit

    # Matched books that are descendants of other matched books.
    nested = Book.objects.filter(id__in=by_book, ancestor__in=by_book)
    descendant_ids = set(nested.values_list('id', flat=True))
    top_level = [hit for hit in ordered if hit.book_id not in descendant_ids]

    for hit in top_level:
        index.get_snippets(hit, query, num=3)

    # Keep only results whose book can still be loaded.
    found = []
    for hit in top_level:
        try:
            book = hit.book
        except Book.DoesNotExist:
            continue
        if book:
            found.append(hit)
    return found
194
195
def search_pictures(query):
    """Search the index for pictures matching ``query``.

    The search runs in four stages (authors, title, metadata, then themes).
    Hits for the same picture are merged into the first result seen for it,
    and results whose picture no longer exists are filtered out.

    Returns a list of ``PictureResult`` objects.
    """
    index = Search()
    terms = query.split()
    # (fields a stage requires a match in, value passed as ``book=``)
    stages = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['themes_pl', 'themes_pl_nonstem'], False),
    )

    searchable = []
    stage_hits = []
    for required_fields, book_flag in stages:
        # The searchable field list accumulates across stages (one shared
        # list, extended in place); only the stage's own fields are required.
        searchable.extend(required_fields)
        stage_hits.append(
            index.search_words(
                terms, searchable, required=required_fields, book=book_flag, picture=True))

    ordered = []
    by_picture = {}
    for hits in stage_hits:
        for hit in sorted(PictureResult.aggregate(hits), reverse=True):
            key = hit.picture_id
            if key in by_picture:
                by_picture[key].merge(hit)
            else:
                ordered.append(hit)
                by_picture[key] = hit

    # Keep only results whose picture can still be loaded.
    found = []
    for hit in ordered:
        try:
            picture = hit.picture
        except Picture.DoesNotExist:
            continue
        if picture:
            found.append(hit)
    return found
229
230
def search_pd_authors(query):
    """Find ``pdcounter`` authors whose name contains ``query``.

    The match is case-insensitive. Authors whose slug already exists as a
    ``Tag`` of category 'author' are excluded from the result.

    Returns a queryset of ``Author``; it is empty when ``query`` is blank.
    """
    if not query or not query.strip():
        # An empty needle would make ``icontains`` match every author.
        return Author.objects.none()

    matching = Author.objects.filter(name__icontains=query)
    matching_slugs = list(matching.values_list('slug', flat=True))
    existing_slugs = Tag.objects.filter(
        category='author', slug__in=matching_slugs).values_list('slug', flat=True)
    return matching.exclude(slug__in=existing_slugs)
238
239
def prepare_query(query):
    """Normalise a raw user query before it is handed to the search code.

    Steps:
        1. drop Unicode private-use characters (category 'Co');
        2. replace query-syntax characters with spaces;
        3. collapse all whitespace runs to single spaces and trim the ends;
        4. keep at most the first 10 words.

    Whitespace is normalised last, so the spaces introduced by steps 1-2
    never leave doubled or trailing blanks in the result.

    Returns the cleaned query string (possibly empty).
    """
    import unicodedata
    # filter out private use characters
    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
    words = remove_query_syntax_chars(query).split()
    return ' '.join(words[:10])