single search query with boosts - stub
[wolnelektury.git] / src / search / views.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6 from django.http.response import HttpResponseRedirect
7 from django.shortcuts import render_to_response
8 from django.template import RequestContext
9 from django.views.decorators import cache
10 from django.http import HttpResponse, JsonResponse
11
12 from catalogue.models import Book, Tag
13 from pdcounter.models import Author
14 from picture.models import Picture
15 from search.index import Search, SearchResult, PictureResult
16 from suggest.forms import PublishingSuggestForm
17 import re
18 import json
19
20 from wolnelektury.utils import re_escape
21
22
def match_word_re(word):
    """Return a regex matching ``word`` as a whole word on the default database.

    Word-boundary syntax differs between database regex dialects, so the
    pattern is chosen from the ``ENGINE`` string of the default database.

    ``word`` is interpolated as-is; the caller is responsible for escaping
    any regex metacharacters in it.

    Returns ``None`` when the database backend is not recognised.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        return r"\b%s\b" % word
    if 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % word
    if 'postgresql' in engine or 'postgis' in engine:
        # PostgreSQL word-start / word-end escapes; hint() in this module
        # already relies on the word-start one.
        return r"\m%s\M" % word
    return None
28
29
# Backslash, slash, asterisk, colon, parentheses and braces.
query_syntax_chars = re.compile(r"[\\/*:(){}]")


def remove_query_syntax_chars(query, replace=' '):
    """Return ``query`` with each character matched by ``query_syntax_chars``
    substituted by ``replace`` (a single space by default)."""
    cleaned = re.sub(query_syntax_chars, replace, query)
    return cleaned
35
36
def did_you_mean(query, tokens):
    """Return a suggested replacement for ``query``.

    Currently a stub: ``query`` is handed back unchanged and ``tokens`` is
    ignored.
    """
    # A previous, disabled implementation skipped tokens that matched an
    # author tag name, replaced other tokens with a dictionary suggestion
    # (that part was itself switched off), and returned None when nothing
    # was changed.
    suggestion = query
    return suggestion
61
62
@cache.never_cache
def hint(request):
    """Return autocomplete suggestions for the ``term`` GET parameter.

    Author tags whose ``name_pl`` matches the term at a word start are listed
    first, followed by books whose title does, up to ``max`` entries in total
    (20 when ``max`` is missing, not an integer, or below 1).

    The response is a JSON list. When a ``callback`` GET parameter holding a
    plain (optionally dotted) identifier is supplied, the list is wrapped in
    a call to it (JSONP); any other callback value is ignored.

    An empty list is returned when the term is shorter than 2 characters or
    contains nothing but query syntax characters and whitespace.
    """
    prefix = request.GET.get('term', '')
    if len(prefix) < 2:
        return JsonResponse([], safe=False)

    prefix = ' '.join(remove_query_syntax_chars(prefix).split())
    if not prefix:
        # Only syntax characters/whitespace were given; a bare word-start
        # pattern would match practically every row.
        return JsonResponse([], safe=False)
    # Raw string: "\m" is the database regex word-start escape, not a
    # Python string escape.
    word_start = r'\m' + re_escape(prefix)

    try:
        limit = int(request.GET.get('max', ''))
    except ValueError:
        limit = 20
    else:
        if limit < 1:
            limit = 20

    authors = Tag.objects.filter(
        category='author', name_pl__iregex=word_start).only('name', 'id', 'slug', 'category')
    data = [
        {
            'label': author.name,
            'id': author.id,
            'url': author.get_absolute_url(),
        }
        for author in authors[:limit]
    ]
    if len(data) < limit:
        # Fill the remaining slots with matching book titles.
        data += [
            {
                'label': b.title,
                'author': b.author_unicode(),
                'id': b.id,
                'url': b.get_absolute_url()
            }
            for b in Book.objects.filter(title__iregex=word_start)[:limit - len(data)]
        ]

    callback = request.GET.get('callback', None)
    # The callback name is echoed verbatim into the response body, so only
    # accept something that looks like a JavaScript identifier path.
    if callback and re.match(r'[A-Za-z_$][A-Za-z0-9_$.]*\Z', callback):
        # NOTE(review): a JSONP body is JavaScript; the JSON content type is
        # kept unchanged here for compatibility with existing clients.
        return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                            content_type="application/json; charset=utf-8")
    return JsonResponse(data, safe=False)
105
106
@cache.never_cache
def main(request):
    """Render the search results page for the ``q`` GET parameter.

    Queries shorter than 2 or longer than 256 characters get a dedicated
    page. Otherwise books, pictures and pdcounter authors are searched: no
    hits at all renders the "no hits" page with a publishing suggestion form,
    a single author as the only hit redirects to that author's page, and
    anything else renders the combined results page.
    """
    def respond(template, context):
        # Every page in this view is rendered with a RequestContext.
        return render_to_response(template, context, context_instance=RequestContext(request))

    query = request.GET.get('q', '')
    query_length = len(query)
    if query_length < 2:
        return respond('catalogue/search_too_short.html', {'prefix': query})
    if query_length > 256:
        return respond('catalogue/search_too_long.html', {'prefix': query})

    query = prepare_query(query)
    pd_authors = search_pd_authors(query)
    books = search_books(query)
    pictures = search_pictures(query)
    suggestion = u''

    has_works = bool(books or pictures)
    if not has_works:
        if not pd_authors:
            form = PublishingSuggestForm(initial={"books": query + ", "})
            return respond(
                'catalogue/search_no_hits.html',
                {
                    'form': form,
                    'did_you_mean': suggestion
                })
        if len(pd_authors) == 1:
            # The lone author is the only hit: go straight to their page.
            return HttpResponseRedirect(pd_authors[0].get_absolute_url())

    return respond(
        'catalogue/search_multiple_hits.html',
        {
            'pd_authors': pd_authors,
            'books': books,
            'pictures': pictures,
            'did_you_mean': suggestion
        })
146
147
def search_books(query):
    """Return a list of book search results for ``query``.

    The query is split on whitespace and searched in a single call across
    all field sets. Results are taken in descending sort order and merged so
    that each book appears once. Results for books that have an ancestor
    among the results are dropped, as are results whose book no longer
    exists in the database.
    """
    search = Search()
    words = query.split()
    # NOTE(review): entries are presumably (field names, is_book flag, boost)
    # -- confirm against Search.search_words.
    fieldsets = (
        (['authors'], True, 8),
        (['title'], True, 4),
        (['metadata'], True, 2),
        (['text', 'themes_pl'], False, 1),
    )
    query_results = search.search_words(words, fieldsets)

    results = []
    ids_results = {}
    for result in sorted(SearchResult.aggregate(query_results), reverse=True):
        book_id = result.book_id
        if book_id in ids_results:
            # Fold later hits for the same book into the first one seen.
            ids_results[book_id].merge(result)
        else:
            results.append(result)
            ids_results[book_id] = result

    # Books that were found together with one of their ancestors.
    descendant_ids = set(
        Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
    results = [result for result in results if result.book_id not in descendant_ids]
    for result in results:
        search.get_snippets(result, query, num=3)

    def ensure_exists(r):
        try:
            return r.book
        except Book.DoesNotExist:
            return False

    # Build a real list: on Python 3 ``filter`` returns a lazy iterator that
    # is always truthy, which would defeat emptiness checks in callers.
    return [result for result in results if ensure_exists(result)]
187
188
def search_pictures(query):
    """Return a list of picture search results for ``query``.

    The query is split on whitespace and searched in a single call across
    all field sets with ``picture=True``. Results are taken in descending
    sort order and merged so that each picture appears once; results whose
    picture no longer exists in the database are dropped.
    """
    search = Search()
    words = query.split()
    # NOTE(review): entries are presumably (field names, flag, boost)
    # -- confirm against Search.search_words.
    fieldsets = (
        (['authors'], True, 8),
        (['title'], True, 4),
        (['metadata'], True, 2),
        (['themes_pl'], False, 1),
    )
    query_results = search.search_words(words, fieldsets, picture=True)

    results = []
    ids_results = {}
    for result in sorted(PictureResult.aggregate(query_results), reverse=True):
        picture_id = result.picture_id
        if picture_id in ids_results:
            # Fold later hits for the same picture into the first one seen.
            ids_results[picture_id].merge(result)
        else:
            results.append(result)
            ids_results[picture_id] = result

    def ensure_exists(r):
        try:
            return r.picture
        except Picture.DoesNotExist:
            return False

    # Build a real list: on Python 3 ``filter`` returns a lazy iterator that
    # is always truthy, which would defeat emptiness checks in callers.
    return [result for result in results if ensure_exists(result)]
223
224
def search_pd_authors(query):
    """Return pdcounter authors whose name contains ``query``
    (case-insensitively), leaving out those whose slug already belongs to
    an author tag."""
    matching = Author.objects.filter(name__icontains=query)
    matching_slugs = list(matching.values_list('slug', flat=True))
    tagged_slugs = Tag.objects.filter(
        category='author', slug__in=matching_slugs).values_list('slug', flat=True)
    return matching.exclude(slug__in=tagged_slugs)
232
233
def prepare_query(query):
    """Normalise a raw user query before it is handed to the search functions.

    Unicode private-use characters are dropped, query syntax characters are
    replaced with spaces, whitespace is collapsed to single spaces with none
    leading or trailing, and only the first 10 words are kept.
    """
    import unicodedata

    # filter out private use characters
    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
    query = remove_query_syntax_chars(query)

    # Collapse whitespace last: the replacements above can leave leading,
    # trailing or doubled spaces, which would break substring matching on
    # the resulting query.
    return ' '.join(query.split()[:10])
243         query = ' '.join(words[:10])
244     return query