6120d25fec6b218ac947c5fa56e6e3038f3b7553
[wolnelektury.git] / src / search / views.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6 from django.http.response import HttpResponseRedirect
7 from django.shortcuts import render_to_response
8 from django.template import RequestContext
9 from django.views.decorators import cache
10 from django.http import HttpResponse, JsonResponse
11
12 from catalogue.models import Book, Tag
13 from pdcounter.models import Author
14 from picture.models import Picture
15 from search.index import Search, SearchResult, PictureResult
16 from suggest.forms import PublishingSuggestForm
17 import re
18 import json
19
20 from wolnelektury.utils import re_escape
21
22
def match_word_re(word):
    """Wrap *word* in whole-word regex anchors for the default database.

    The word-boundary syntax differs between database backends, so the
    anchors are chosen from the configured ``ENGINE`` of the ``default``
    database.  *word* is interpolated as-is; escaping any regex
    metacharacters in it is left to the caller.

    Returns the pattern string, or ``None`` when the engine is not one of
    the recognised backends.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        return r"\b%s\b" % word
    if 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % word
    if 'postgresql' in engine or 'postgis' in engine:
        # PostgreSQL spells "start of word" / "end of word" as \m and \M;
        # the hint view in this module already relies on \m.
        return r"\m%s\M" % word
    # Unknown backend: keep the historical "no pattern" result.
    return None
28
29
# Single characters that are stripped out of user queries before searching.
query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")


def remove_query_syntax_chars(query, replace=' '):
    """Return *query* with every query-syntax character substituted.

    Each character matched by ``query_syntax_chars`` is replaced with
    *replace* (a single space by default); everything else is kept.
    """
    return re.sub(query_syntax_chars, replace, query)
35
36
def did_you_mean(query, tokens):
    """Return a corrected spelling of *query*; currently a pass-through.

    Suggestion support is switched off, so *query* is handed back
    unchanged and *tokens* is ignored.

    The implementation that used to live here skipped every token whose
    whole word matched an author ``Tag`` name, asked a spell-check
    dictionary for a replacement of the remaining tokens (that step had
    itself been disabled), substituted the replacements into the query,
    and returned ``None`` when nothing was changed.
    """
    return query
61
62
@cache.never_cache
def hint(request):
    """Return autocomplete suggestions for the search box.

    GET parameters:
        term: text typed so far; fewer than two characters yields ``[]``.
        max: maximum number of suggestions; anything missing, non-numeric
            or below 1 falls back to 20.
        callback: optional JSONP callback name.

    Author tags whose Polish name has a word starting with the term come
    first; if they do not fill the limit, books whose title has a word
    starting with the term are appended.

    Returns a JSON list of dicts (``label``, ``id``, ``url``, plus
    ``author`` for books), wrapped in ``callback(...);`` when a callback
    is given.  A callback that is not a plain (optionally dotted)
    identifier is rejected with HTTP 400.
    """
    prefix = request.GET.get('term', '')
    if len(prefix) < 2:
        return JsonResponse([], safe=False)

    cleaned = ' '.join(remove_query_syntax_chars(prefix).split())
    if not cleaned:
        # The term consisted only of syntax characters; an empty prefix
        # would match every word start and return arbitrary rows.
        return JsonResponse([], safe=False)
    prefix = re_escape(cleaned)

    try:
        limit = int(request.GET.get('max', ''))
    except ValueError:
        limit = 20
    else:
        if limit < 1:
            limit = 20

    callback = request.GET.get('callback', None)
    if callback and not re.match(r'[A-Za-z_$][A-Za-z0-9_$.]*\Z', callback):
        # The callback is echoed verbatim into the response body, so only
        # identifier-like names are accepted to prevent script injection.
        return HttpResponse('Invalid callback name', status=400)

    # \m is the PostgreSQL "beginning of word" regex constraint; the raw
    # string avoids Python treating "\m" as an (invalid) escape sequence.
    word_start = r'\m' + prefix

    authors = Tag.objects.filter(
        category='author', name_pl__iregex=word_start).only('name', 'id', 'slug', 'category')
    data = [
        {
            'label': author.name,
            'id': author.id,
            'url': author.get_absolute_url(),
        }
        for author in authors[:limit]
    ]
    if len(data) < limit:
        # Top up with books, but never beyond the requested limit.
        data += [
            {
                'label': b.title,
                'author': b.author_unicode(),
                'id': b.id,
                'url': b.get_absolute_url()
            }
            for b in Book.objects.filter(title__iregex=word_start)[:limit - len(data)]
        ]
    if callback:
        return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                            content_type="application/json; charset=utf-8")
    return JsonResponse(data, safe=False)
105
106
@cache.never_cache
def main(request):
    """Search results view; at the moment it always answers HTTP 503.

    The unconditional return right below short-circuits the view, so the
    rest of the body is unreachable.  It is presumably kept so search can
    be re-enabled by deleting that single line.

    When reachable, the remaining code reads the ``q`` GET parameter,
    rejects queries shorter than 2 or longer than 256 characters with a
    dedicated template, runs the author, book and picture searches, and
    then either renders the "no hits" page with a suggestion form,
    redirects to the only matching pdcounter author, or renders the
    combined results page.
    """
    return HttpResponse('Search is temporarily disabled', status=503)

    def respond(template, context):
        # Every page of this view is rendered with a RequestContext.
        return render_to_response(
            template, context, context_instance=RequestContext(request))

    query = request.GET.get('q', '')
    if len(query) < 2:
        return respond('catalogue/search_too_short.html', {'prefix': query})
    if len(query) > 256:
        return respond('catalogue/search_too_long.html', {'prefix': query})

    query = prepare_query(query)
    pd_authors = search_pd_authors(query)
    books = search_books(query)
    pictures = search_pictures(query)
    suggestion = u''
    nothing_published = not (books or pictures)

    if nothing_published and not pd_authors:
        # Nothing at all: offer the "suggest a publication" form,
        # pre-filled with the query.
        return respond('catalogue/search_no_hits.html', {
            'form': PublishingSuggestForm(initial={"books": query + ", "}),
            'did_you_mean': suggestion
        })

    if nothing_published and len(pd_authors) == 1:
        # The only hit is a single author page, so go straight to it.
        return HttpResponseRedirect(pd_authors[0].get_absolute_url())

    return respond('catalogue/search_multiple_hits.html', {
        'pd_authors': pd_authors,
        'books': books,
        'pictures': pictures,
        'did_you_mean': suggestion
    })
147
148
def search_books(query):
    """Search the index for books matching *query*.

    The query words are searched in four passes over a growing set of
    fields (authors, then title, then metadata, then text and themes).
    Hits are aggregated per book, hits for the same book from later
    passes are merged into the first one, and books that have an ancestor
    among the hits are dropped.  Snippets are then fetched for each
    remaining result.

    Returns a list of search results whose ``book`` still exists in the
    database.  A real list (not a lazy ``filter`` iterator) is returned
    so callers can test it for emptiness and iterate it more than once.
    """
    search = Search()
    results_parts = []
    search_fields = []
    words = query.split()
    fieldsets = (
        (['authors'], True),
        (['title'], True),
        (['metadata'], True),
        (['text', 'themes_pl'], False),
    )
    for fields, is_book in fieldsets:
        # search_fields accumulates across passes, while `required` names
        # only the fields added in this pass.
        search_fields += fields
        results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))

    results = []
    ids_results = {}
    for results_part in results_parts:
        # Within one pass, results are taken in descending order as
        # defined by the result objects' own comparison.
        for result in sorted(SearchResult.aggregate(results_part), reverse=True):
            book_id = result.book_id
            if book_id in ids_results:
                ids_results[book_id].merge(result)
            else:
                results.append(result)
                ids_results[book_id] = result

    # Books that were hit and whose ancestor was hit as well are removed
    # from the result list.
    descendant_ids = set(
        Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
    results = [result for result in results if result.book_id not in descendant_ids]
    for result in results:
        search.get_snippets(result, query, num=3)

    def ensure_exists(r):
        # Accessing r.book may raise if the indexed book is gone from
        # the database.
        try:
            return r.book
        except Book.DoesNotExist:
            return False

    return [result for result in results if ensure_exists(result)]
187
188
def search_pictures(query):
    """Search the index for pictures matching *query*.

    The query words are searched in four passes over a growing set of
    fields (authors, then title, then metadata, then themes), each time
    with ``picture=True``.  Hits are aggregated per picture and hits for
    the same picture from later passes are merged into the first one.

    Returns a list of search results whose ``picture`` still exists in
    the database.  A real list (not a lazy ``filter`` iterator) is
    returned so callers can test it for emptiness and iterate it more
    than once.
    """
    search = Search()
    results_parts = []
    search_fields = []
    words = query.split()
    fieldsets = (
        (['authors'], True),
        (['title'], True),
        (['metadata'], True),
        (['themes_pl'], False),
    )
    for fields, is_book in fieldsets:
        # search_fields accumulates across passes, while `required` names
        # only the fields added in this pass.
        search_fields += fields
        results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))

    results = []
    ids_results = {}
    for results_part in results_parts:
        # Within one pass, results are taken in descending order as
        # defined by the result objects' own comparison.
        for result in sorted(PictureResult.aggregate(results_part), reverse=True):
            picture_id = result.picture_id
            if picture_id in ids_results:
                ids_results[picture_id].merge(result)
            else:
                results.append(result)
                ids_results[picture_id] = result

    def ensure_exists(r):
        # Accessing r.picture may raise if the indexed picture is gone
        # from the database.
        try:
            return r.picture
        except Picture.DoesNotExist:
            return False

    return [result for result in results if ensure_exists(result)]
222
223
def search_pd_authors(query):
    """Find pdcounter ``Author`` records whose name contains *query*.

    Authors whose slug is already used by a catalogue ``Tag`` of category
    ``'author'`` are left out.  The match is case-insensitive and the
    result is returned as a queryset.
    """
    candidates = Author.objects.filter(name__icontains=query)
    candidate_slugs = list(candidates.values_list('slug', flat=True))
    catalogue_authors = Tag.objects.filter(category='author', slug__in=candidate_slugs)
    taken_slugs = catalogue_authors.values_list('slug', flat=True)
    return candidates.exclude(slug__in=taken_slugs)
231
232
def prepare_query(query):
    """Normalise a raw user query before it is passed to the searches.

    Private-use Unicode characters are dropped, query-syntax characters
    are turned into spaces, and the result is cut to at most 10 words.

    Whitespace is normalised *after* the syntax characters have been
    replaced, so the returned string never has leading, trailing or
    doubled spaces.  That matters because the query is also used for a
    substring match on author names in this module.

    Returns the cleaned query, with words separated by single spaces.
    """
    import unicodedata

    # 'Co' is the Unicode general category of private-use code points.
    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
    words = remove_query_syntax_chars(query).split()
    return ' '.join(words[:10])
242         query = ' '.join(words[:10])
243     return query