escape user-provided strings used in regular expressions
[wolnelektury.git] / src / search / views.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6 from django.shortcuts import render_to_response
7 from django.template import RequestContext
8 from django.views.decorators import cache
9 from django.http import HttpResponse, JsonResponse
10 from django.utils.translation import ugettext as _
11
12 from catalogue.utils import split_tags
13 from catalogue.models import Book, Tag
14 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
15 from search.index import Search, SearchResult
16 from suggest.forms import PublishingSuggestForm
17 import re
18 import json
19
20 from wolnelektury.utils import re_escape
21
22
def match_word_re(word):
    r"""Wrap ``word`` in the whole-word regex anchors of the default database.

    The anchor syntax is chosen by looking for a backend name inside
    ``settings.DATABASES['default']['ENGINE']``:

    * ``sqlite``: ``\bword\b``
    * ``mysql``: ``[[:<:]]word[[:>:]]``
    * ``postgresql`` / ``postgis``: ``\mword\M``

    ``word`` is interpolated as-is; it is NOT escaped here, so callers
    passing user input must escape it themselves beforehand.

    Returns the pattern string, or ``None`` when the engine is not one of
    the recognised backends.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        return r"\b%s\b" % word
    if 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % word
    if 'postgresql' in engine or 'postgis' in engine:
        # Same word-boundary escapes the hint view in this module relies on.
        return r"\m%s\M" % word
    # Unknown backend: keep the historical "no pattern" result explicit.
    return None
28
29
# Matches one backslash, slash, asterisk, colon, parenthesis or curly brace.
# Presumably these are the characters the search backend treats as query
# syntax -- confirm against the search index implementation.
query_syntax_chars = re.compile(r"[\\/*:(){}]")


def remove_query_syntax_chars(query, replace=' '):
    """Return ``query`` with every syntax character swapped for ``replace``.

    Each character matched by ``query_syntax_chars`` is substituted
    individually; ``replace`` defaults to a single space.
    """
    return re.sub(query_syntax_chars, replace, query)
35
36
def did_you_mean(query, tokens):
    """Return a spelling suggestion for ``query``; currently a no-op.

    Suggestion logic is switched off, so ``query`` is handed back untouched
    and ``tokens`` is ignored.
    """
    # The retired implementation (previously kept here as commented-out
    # code) skipped every token matching an author tag name via
    # match_word_re(), looked the remaining tokens up in a spell-check
    # dictionary (a step that was itself disabled with ``if False``),
    # substituted the lower-cased suggestions into the query, and returned
    # None when nothing had been changed.
    return query
61
62
# A JSONP callback must be a plain or dotted JavaScript identifier
# (e.g. ``cb`` or ``jQuery123_456``); anything else is rejected so the
# callback name cannot carry script code into the response body.
_jsonp_callback_re = re.compile(r'^[A-Za-z_$][A-Za-z0-9_$.]*\Z')


@cache.never_cache
def hint(request):
    """Return autocomplete hints (authors, then books) for a search prefix.

    GET parameters:
        term: the text typed so far. Query syntax characters are removed and
            whitespace is collapsed; fewer than 2 remaining characters yield
            an empty list.
        max: optional positive integer. Books are appended only while the
            number of hints is below it; when it is missing, non-numeric or
            below 1, only authors are returned.
        callback: optional JSONP callback name. Must be a dotted JavaScript
            identifier, otherwise the response is HTTP 400.

    Returns a JSON list of dicts with ``label``, ``category``, ``id`` and
    ``url`` keys, wrapped as ``callback(...);`` when a callback is given.
    """
    term = request.GET.get('term', '')
    # Measure the length AFTER stripping syntax characters: a term made only
    # of such characters would otherwise leave the bare word-start anchor,
    # which matches every name.
    cleaned = ' '.join(remove_query_syntax_chars(term).split())
    if len(cleaned) < 2:
        return JsonResponse([], safe=False)

    callback = request.GET.get('callback', None)
    if callback and not _jsonp_callback_re.match(callback):
        # The callback is echoed verbatim into the response body, so refuse
        # anything that is not an identifier before doing any work.
        return HttpResponse(status=400)

    # ``\m`` anchors the match at the start of a word (PostgreSQL regex
    # syntax); the user text is escaped so it is matched literally.
    pattern = r'\m' + re_escape(cleaned)

    try:
        limit = int(request.GET.get('max', ''))
    except ValueError:
        limit = -1
    else:
        if limit < 1:
            limit = -1

    data = [
        {
            'label': author.name,
            'category': _('author'),
            'id': author.id,
            'url': author.get_absolute_url(),
        }
        for author in Tag.objects.filter(category='author', name__iregex=pattern)[:10]
    ]
    # With limit == -1 this is never true, so books are only added when a
    # valid ``max`` was supplied and the authors did not already fill it.
    if len(data) < limit:
        data += [
            {
                'label': '<cite>%s</cite>, %s' % (b.title, b.author_unicode()),
                'category': _('book'),
                'id': b.id,
                'url': b.get_absolute_url()
            }
            for b in Book.objects.filter(title__iregex=pattern)[:limit - len(data)]
        ]

    if callback:
        # NOTE(review): JSONP is executed as script, yet it is served as
        # application/json; kept unchanged for existing clients -- consider
        # application/javascript.
        return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                            content_type="application/json; charset=utf-8")
    return JsonResponse(data, safe=False)
104
105
@cache.never_cache
def main(request):
    """Run a full-text search for the ``q`` GET parameter and render results.

    The query has its whitespace collapsed and private-use characters
    removed. Queries shorter than 2 or longer than 256 characters render
    ``catalogue/search_too_short.html`` / ``catalogue/search_too_long.html``.
    Otherwise query syntax characters are replaced, the query is capped at
    10 words, and the search is run over progressively wider field sets.
    Results are merged per book; results whose book no longer exists are
    dropped.

    Renders ``catalogue/search_no_hits.html`` (with a publishing suggestion
    form) when nothing is left, else ``catalogue/search_multiple_hits.html``.
    """
    import unicodedata

    query = request.GET.get('q', '')
    query = ' '.join(query.split())
    # filter out private use characters
    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')

    if len(query) < 2:
        return render_to_response(
            'catalogue/search_too_short.html', {'prefix': query},
            context_instance=RequestContext(request))
    elif len(query) > 256:
        return render_to_response(
            'catalogue/search_too_long.html', {'prefix': query}, context_instance=RequestContext(request))

    query = remove_query_syntax_chars(query)

    words = query.split()
    if len(words) > 10:
        # Cap BOTH the word list and the query string; previously only the
        # string was shortened and the full word list was still searched.
        words = words[:10]
        query = ' '.join(words)

    search = Search()

    tags = search.hint_tags(query, pdcounter=True, prefix=False)
    tags = split_tags(tags)

    results_parts = []

    # Each pass searches all fields accumulated so far, so later passes are
    # supersets of earlier ones; the flag is passed to search_words as
    # ``book``.
    search_fields = []
    fieldsets = (
        (['authors'], True),
        (['title'], True),
        (['metadata'], True),
        (['text', 'themes_pl'], False),
    )
    for fieldset, is_book in fieldsets:
        search_fields += fieldset
        results_parts.append(search.search_words(words, search_fields, book=is_book))

    # Keep one result per book, in order of first appearance; later hits for
    # the same book are merged into the first one.
    results = []
    ids_results = {}
    for results_part in results_parts:
        for result in sorted(SearchResult.aggregate(results_part), reverse=True):
            book_id = result.book_id
            if book_id in ids_results:
                ids_results[book_id].merge(result)
            else:
                results.append(result)
                ids_results[book_id] = result

    for result in results:
        search.get_snippets(result, query, num=3)

    suggestion = u''

    def ensure_exists(r):
        try:
            return r.book
        except Book.DoesNotExist:
            return False

    # Build a real list: the emptiness test below needs one, and a lazy
    # ``filter`` object (Python 3) would always be truthy.
    results = [r for r in results if ensure_exists(r)]

    if not results:
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render_to_response(
            'catalogue/search_no_hits.html',
            {
                'tags': tags,
                'prefix': query,
                'form': form,
                'did_you_mean': suggestion
            },
            context_instance=RequestContext(request))

    return render_to_response(
        'catalogue/search_multiple_hits.html',
        {
            'tags': tags['author'] + tags['kind'] + tags['genre'] + tags['epoch'] + tags['theme'],
            'prefix': query,
            'results': results,
            'did_you_mean': suggestion
        },
        context_instance=RequestContext(request))