Better handling of missing diacritics in search
[wolnelektury.git] / src / search / views.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
import json
import re

from django.conf import settings
from django.http import HttpResponse, JsonResponse
from django.http.response import HttpResponseRedirect
from django.shortcuts import render, render_to_response
from django.template import RequestContext
from django.views.decorators import cache

from catalogue.models import Book, Tag
from pdcounter.models import Author
from picture.models import Picture
from search.index import Search, SearchResult, PictureResult
from suggest.forms import PublishingSuggestForm
from wolnelektury.utils import re_escape
21
22
def match_word_re(word):
    """Return a regex source matching ``word`` as a whole word in the default DB.

    Word-boundary syntax differs between database regex engines, so the
    pattern is chosen from ``settings.DATABASES['default']['ENGINE']``.

    ``word`` is interpolated verbatim; callers must escape it first if it
    may contain regex metacharacters.

    Returns ``None`` when the engine has no known word-boundary syntax.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        return r"\b%s\b" % word
    if 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % word
    if 'postgresql' in engine or 'postgis' in engine:
        # PostgreSQL regexes anchor word start / end with \m and \M.
        return r"\m%s\M" % word
    return None
28
29
# Characters treated as query syntax; they are stripped from user input
# before it is used for searching.
query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")


def remove_query_syntax_chars(query, replace=' '):
    """Return ``query`` with each query-syntax character swapped for ``replace``."""
    return re.sub(query_syntax_chars, replace, query)
35
36
def did_you_mean(query, tokens):
    """Return a corrected version of ``query``; currently a no-op.

    The spell-checking logic that used to live here is disabled: it was
    meant to skip tokens matching an author tag, replace the remaining
    misspelled tokens with the first dictionary suggestion, and return
    ``None`` when nothing changed. Until it is reinstated the query is
    returned unchanged and ``tokens`` is ignored.
    """
    return query
61
62
@cache.never_cache
def hint(request):
    """Return autocomplete hints for the search box: authors first, then books.

    GET parameters:
        term: text typed so far. Fewer than 2 characters (after stripping
            query-syntax characters and collapsing whitespace) yields an
            empty list.
        max: maximum number of hints; a missing, non-integer or < 1 value
            falls back to 20.
        callback: optional JSONP callback name. Used only when it looks
            like a plain (optionally dotted) identifier; otherwise plain
            JSON is returned.
    """
    # Normalise first and check the length afterwards: input made only of
    # syntax characters would otherwise become an empty prefix, and the bare
    # word-start regex below would then match every row.
    prefix = ' '.join(remove_query_syntax_chars(request.GET.get('term', '')).split())
    if len(prefix) < 2:
        return JsonResponse([], safe=False)

    # Raw string: "\m" is the regex word-start anchor (PostgreSQL syntax),
    # not a Python string escape.
    word_start_re = r'\m' + re_escape(prefix)

    try:
        limit = int(request.GET.get('max', ''))
    except ValueError:
        limit = 20
    else:
        if limit < 1:
            limit = 20

    authors = Tag.objects.filter(
        category='author', name_pl__iregex=word_start_re).only('name', 'id', 'slug', 'category')
    data = [
        {
            'label': author.name,
            'id': author.id,
            'url': author.get_absolute_url(),
        }
        for author in authors[:limit]
    ]
    if len(data) < limit:
        # Fill the remaining slots with books whose title matches.
        books = Book.objects.filter(title__iregex=word_start_re)[:limit - len(data)]
        data += [
            {
                'label': b.title,
                'author': b.author_unicode(),
                'id': b.id,
                'url': b.get_absolute_url()
            }
            for b in books
        ]

    callback = request.GET.get('callback', None)
    # The callback name is echoed verbatim into executable script, so accept
    # only identifier-like names to prevent script injection.
    if callback and re.match(r'[A-Za-z_$][0-9A-Za-z_$.]*\Z', callback):
        return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                            content_type="application/json; charset=utf-8")
    return JsonResponse(data, safe=False)
105
106
@cache.never_cache
def main(request):
    """Render the search results page for the ``q`` GET parameter.

    Outcomes:
        * query shorter than 2 characters (raw or after normalisation):
          "too short" page;
        * query longer than 256 characters: "too long" page;
        * nothing found: "no hits" page with a publishing suggestion form;
        * exactly one pdcounter author and nothing else: redirect to that
          author's page;
        * otherwise: the combined results page.
    """
    raw_query = request.GET.get('q', '')
    if len(raw_query) < 2:
        return render(request, 'catalogue/search_too_short.html', {'prefix': raw_query})
    if len(raw_query) > 256:
        return render(request, 'catalogue/search_too_long.html', {'prefix': raw_query})

    query = prepare_query(raw_query)
    # Normalisation strips syntax characters, so re-check the minimum length
    # rather than searching for an (almost) empty string.
    if len(query.strip()) < 2:
        return render(request, 'catalogue/search_too_short.html', {'prefix': raw_query})

    pd_authors = search_pd_authors(query)
    books = search_books(query)
    pictures = search_pictures(query)
    suggestion = u''

    if not (books or pictures or pd_authors):
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render(
            request,
            'catalogue/search_no_hits.html',
            {
                'form': form,
                'did_you_mean': suggestion
            })

    if not (books or pictures) and len(pd_authors) == 1:
        return HttpResponseRedirect(pd_authors[0].get_absolute_url())

    return render(
        request,
        'catalogue/search_multiple_hits.html',
        {
            'pd_authors': pd_authors,
            'books': books,
            'pictures': pictures,
            'did_you_mean': suggestion
        })
146
147
def search_books(query):
    """Search the index for books matching ``query`` and return a list of results.

    The search runs in several passes over a growing set of fields
    (authors, then title, then metadata, then text and themes). Results
    from all passes are merged per book, keeping the position of a book's
    first occurrence. Books whose ancestor is also among the results are
    dropped, snippets are attached, and results whose book no longer exists
    in the database are discarded.
    """
    search = Search()
    words = query.split()
    fieldsets = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
    )

    results_parts = []
    search_fields = []
    for fields, is_book in fieldsets:
        # Each pass looks at every field seen so far, with the newly added
        # ones passed as ``required``.
        search_fields += fields
        results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))

    results = []
    ids_results = {}
    for results_part in results_parts:
        for result in sorted(SearchResult.aggregate(results_part), reverse=True):
            book_id = result.book_id
            if book_id in ids_results:
                ids_results[book_id].merge(result)
            else:
                results.append(result)
                ids_results[book_id] = result

    # Hide books that have an ancestor among the hits.
    descendant_ids = set(
        Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
    results = [result for result in results if result.book_id not in descendant_ids]
    for result in results:
        search.get_snippets(result, query, num=3)

    def ensure_exists(r):
        try:
            return r.book
        except Book.DoesNotExist:
            return False

    # Build a real list: on Python 3 ``filter()`` returns a lazy iterator that
    # is always truthy and can be consumed only once, which breaks callers
    # that test the result for emptiness before rendering it.
    return [result for result in results if ensure_exists(result)]
186
187
def search_pictures(query):
    """Search the index for pictures matching ``query`` and return a list of results.

    The search runs in several passes over a growing set of fields
    (authors, then title, then metadata, then themes). Results from all
    passes are merged per picture, keeping the position of a picture's
    first occurrence, and results whose picture no longer exists in the
    database are discarded.
    """
    search = Search()
    words = query.split()
    fieldsets = (
        (['authors', 'authors_nonstem'], True),
        (['title', 'title_nonstem'], True),
        (['metadata', 'metadata_nonstem'], True),
        (['themes_pl', 'themes_pl_nonstem'], False),
    )

    results_parts = []
    search_fields = []
    for fields, is_book in fieldsets:
        # Each pass looks at every field seen so far, with the newly added
        # ones passed as ``required``.
        search_fields += fields
        results_parts.append(
            search.search_words(words, search_fields, required=fields, book=is_book, picture=True))

    results = []
    ids_results = {}
    for results_part in results_parts:
        for result in sorted(PictureResult.aggregate(results_part), reverse=True):
            picture_id = result.picture_id
            if picture_id in ids_results:
                ids_results[picture_id].merge(result)
            else:
                results.append(result)
                ids_results[picture_id] = result

    def ensure_exists(r):
        try:
            return r.picture
        except Picture.DoesNotExist:
            return False

    # Build a real list: on Python 3 ``filter()`` returns a lazy iterator that
    # is always truthy and can be consumed only once, which breaks callers
    # that test the result for emptiness before rendering it.
    return [result for result in results if ensure_exists(result)]
221
222
def search_pd_authors(query):
    """Return pdcounter authors whose name contains ``query`` (case-insensitive).

    Authors whose slug already belongs to a catalogue tag of category
    ``author`` are left out.
    """
    candidates = Author.objects.filter(name__icontains=query)
    candidate_slugs = list(candidates.values_list('slug', flat=True))
    taken_slugs = Tag.objects.filter(
        category='author', slug__in=candidate_slugs,
    ).values_list('slug', flat=True)
    return candidates.exclude(slug__in=taken_slugs)
230
231
def prepare_query(query):
    """Normalise a raw user query before it is passed to the search functions.

    Private-use Unicode characters are dropped, query-syntax characters are
    replaced with spaces, whitespace is collapsed to single spaces with no
    leading or trailing space, and the query is capped at its first 10 words.
    """
    import unicodedata

    # filter out private use characters
    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
    query = remove_query_syntax_chars(query)

    # Split and re-join only after all removals: replacing syntax characters
    # introduces new spaces, and e.g. a trailing one ("Mickiewicz." ->
    # "Mickiewicz ") would make substring lookups miss.
    words = query.split()
    return ' '.join(words[:10])