search fix (WHAT HAVE I DONE)
[wolnelektury.git] / src / search / views.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6 from django.shortcuts import render_to_response
7 from django.template import RequestContext
8 from django.views.decorators import cache
9 from django.http import HttpResponse, JsonResponse
10 from django.utils.translation import ugettext as _
11
12 from catalogue.utils import split_tags
13 from catalogue.models import Book
14 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
15 from search.index import Search, SearchResult
16 from suggest.forms import PublishingSuggestForm
17 import re
18 import json
19
20
def match_word_re(word):
    r"""Build a whole-word regex for ``word`` in the default database's dialect.

    The dialect is picked by substring match on
    ``settings.DATABASES['default']['ENGINE']``:

    * an engine containing ``sqlite`` gets ``\b`` word boundaries,
    * an engine containing ``mysql`` gets ``[[:<:]]`` / ``[[:>:]]``.

    For any other engine the function returns ``None``. ``word`` is
    interpolated as-is, without any regex escaping.
    """
    engine = settings.DATABASES['default']['ENGINE']
    pattern = None
    if 'sqlite' in engine:
        pattern = r"\b%s\b" % word
    elif 'mysql' in engine:
        pattern = "[[:<:]]%s[[:>:]]" % word
    return pattern
26
27
# Characters with special meaning in the search query syntax:
# backslash, slash, asterisk, colon, parentheses and curly braces.
query_syntax_chars = re.compile(r"[\\/*:(){}]")


def remove_query_syntax_chars(query, replace=' '):
    """Replace every query-syntax character in ``query`` with ``replace``.

    ``replace`` defaults to a single space. It is inserted literally: it is
    passed through a function so that backslashes in it are not interpreted
    as regex escapes or group references.

    Returns the resulting string; input without any syntax characters is
    returned unchanged.
    """
    return query_syntax_chars.sub(lambda match: replace, query)
33
34
def did_you_mean(query, tokens):
    """Return a suggested correction for ``query``; currently a no-op.

    Suggestions are switched off: ``query`` is handed back unchanged and
    ``tokens`` is not used.

    A disabled earlier draft skipped tokens that matched an author tag as a
    whole word, swapped each remaining token the dictionary rejected for
    the dictionary's first lower-cased suggestion, and returned ``None``
    when nothing was changed.
    """
    return query
59
60
@cache.never_cache
def hint(request):
    """Return autocomplete hints for the ``term`` GET parameter.

    GET parameters read:
        term: the text typed so far; fewer than two characters yields an
            empty JSON list.
        max: optional maximum number of hints. A missing, non-numeric or
            smaller-than-1 value means "no limit".
        callback: optional JSONP callback name.

    Each hint is a dict with ``label``, ``category``, ``id`` and ``url``
    keys. Tag hints come first; book hints are only looked up when the
    limit has not been used up by the tags.

    The response is a JSON list, or a JSONP script when a syntactically
    valid ``callback`` is given.
    """
    prefix = request.GET.get('term', '')
    if len(prefix) < 2:
        return JsonResponse([], safe=False)

    prefix = remove_query_syntax_chars(prefix)

    search = Search()
    # Tags will narrow the results here, but tags can be attached both to
    # a book and to fragments. If the tags concern only the book, it matters
    # that the new ones are in the same book; if they concern themes, it
    # matters that they are in the same fragment.

    def category_name(c):
        # Public-domain-counter categories carry a 'pd_' prefix; strip it so
        # they are translated like the regular category names.
        if c.startswith('pd_'):
            c = c[len('pd_'):]
        return _(c)

    try:
        limit = int(request.GET.get('max', ''))
    except ValueError:
        limit = -1
    else:
        if limit < 1:
            limit = -1
    # From here on -1 means "unlimited": the countdown below only stops at
    # exactly 0, which a negative counter never reaches.

    # Materialize once: the list is scanned again for every candidate by
    # is_dupe() below.
    all_tags = list(search.hint_tags(prefix, pdcounter=True))

    def is_dupe(tag):
        # any() gives a real boolean and stops at the first match, instead
        # of relying on the truthiness of a filter() result.
        if isinstance(tag, PDCounterAuthor):
            return any(t.slug == tag.slug and t != tag for t in all_tags)
        if isinstance(tag, PDCounterBook):
            # NOTE(review): unlike the author case this does not exclude
            # `tag` itself, so a PDCounterBook present in the hint list
            # always counts as a duplicate -- confirm this is intended.
            return any(b.slug == tag.slug for b in all_tags)
        return False

    tags = [t for t in all_tags if not is_dupe(t)]

    data = []
    for t in tags:
        if not limit:
            break
        limit -= 1
        data.append({
            'label': t.name,
            'category': category_name(t.category),
            'id': t.id,
            'url': t.get_absolute_url()
            })
    if limit:
        books = search.hint_books(prefix)
        for b in books:
            if not limit:
                break
            limit -= 1
            data.append({
                'label': b.title,
                'category': _('book'),
                'id': b.id,
                'url': b.get_absolute_url()
                })

    callback = request.GET.get('callback', None)
    # The callback name is untrusted input echoed into an executable
    # response, so only a plain (optionally dotted) JavaScript identifier is
    # accepted; anything else falls back to the plain JSON response.
    if callback and re.match(r'^[A-Za-z_$][A-Za-z0-9_$.]*\Z', callback):
        # JSONP is a script, so it is served with a JavaScript MIME type.
        return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                            content_type="application/javascript; charset=utf-8")
    return JsonResponse(data, safe=False)
130
131
@cache.never_cache
def main(request):
    """Render the search results page for the ``q`` GET parameter.

    The query is whitespace-normalized, stripped of private-use characters
    and of query-syntax characters. Queries shorter than 2 or longer than
    256 characters get the ``search_too_short`` / ``search_too_long``
    templates instead of a search.

    Results are collected in five groups -- author, translator, title,
    text content and "other" (everything else) -- and rendered with
    ``catalogue/search_multiple_hits.html``. When every group is empty,
    ``catalogue/search_no_hits.html`` is rendered together with a
    publishing-suggestion form pre-filled with the query.
    """
    import unicodedata

    query = request.GET.get('q', '')
    # Collapse runs of whitespace and trim both ends.
    query = ' '.join(query.split())
    # filter out private use characters
    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')

    if len(query) < 2:
        return render_to_response(
            'catalogue/search_too_short.html', {'prefix': query},
            context_instance=RequestContext(request))
    elif len(query) > 256:
        return render_to_response(
            'catalogue/search_too_long.html', {'prefix': query},
            context_instance=RequestContext(request))

    query = remove_query_syntax_chars(query)

    search = Search()

    theme_terms = search.index.analyze(text=query, field="themes_pl") \
        + search.index.analyze(text=query, field="themes")

    # change hints
    tags = search.hint_tags(query, pdcounter=True, prefix=False)
    tags = split_tags(tags)

    author_results = search.search_phrase(query, 'authors', book=True)
    translator_results = search.search_phrase(query, 'translators', book=True)
    title_results = search.search_phrase(query, 'title', book=True)

    # Boost main author/title results with mixed search, and save some of
    # its results for end of list.
    author_title_mixed = search.search_some(
        query, ['authors', 'translators', 'title', 'tags'], query_terms=theme_terms)
    author_title_rest = []

    # Loop-invariant: the single-field hits the mixed results are matched
    # against.
    field_hits = author_results + translator_results + title_results
    for b in author_title_mixed:
        also_in_mixed = [ba for ba in field_hits if ba.book_id == b.book_id]
        for b2 in also_in_mixed:
            b2.boost *= 1.1
        # A mixed hit with no counterpart among the single-field hits is
        # kept for the "other" group. (This must be an emptiness test: an
        # identity comparison against a fresh [] is never true.)
        if not also_in_mixed:
            author_title_rest.append(b)

    # Do a phrase search but a term search as well - this can give us better
    # snippets than search_everywhere, because the query is using only one
    # field.
    text_phrase = SearchResult.aggregate(
        search.search_phrase(query, 'text', snippets=True, book=False),
        search.search_some(query, ['text'], snippets=True, book=False, query_terms=theme_terms))

    everywhere = search.search_everywhere(query, query_terms=theme_terms)

    def already_found(results):
        def f(e):
            for r in results:
                if e.book_id == r.book_id:
                    e.boost = 0.9
                    # NOTE(review): `results` is the temporary concatenation
                    # built below, so this append does not reach any of the
                    # displayed groups -- confirm whether that is intended.
                    results.append(e)
                    return True
            return False
        return f
    f = already_found(field_hits + text_phrase)
    # Drop from "everywhere" any book already present in a specific group.
    everywhere = [e for e in everywhere if not f(e)]

    author_results = SearchResult.aggregate(author_results)
    translator_results = SearchResult.aggregate(translator_results)
    title_results = SearchResult.aggregate(title_results)

    everywhere = SearchResult.aggregate(everywhere, author_title_rest)

    for field, res in [('authors', author_results),
                       ('translators', translator_results),
                       ('title', title_results),
                       ('text', text_phrase),
                       ('text', everywhere)]:
        res.sort(reverse=True)
        for r in res:
            search.get_snippets(r, query, field, 3)

    suggestion = u''

    def ensure_exists(r):
        # Accessing r.book raises Book.DoesNotExist for a result whose book
        # is gone; such results are filtered out below.
        try:
            return r.book
        except Book.DoesNotExist:
            return False

    # List comprehensions keep these as real lists, which the concatenation
    # and in-place sorts below require.
    author_results = [r for r in author_results if ensure_exists(r)]
    translator_results = [r for r in translator_results if ensure_exists(r)]
    title_results = [r for r in title_results if ensure_exists(r)]
    text_phrase = [r for r in text_phrase if ensure_exists(r)]
    everywhere = [r for r in everywhere if ensure_exists(r)]

    results = author_results + translator_results + title_results + text_phrase + everywhere
    # ensure books do exist & sort them
    for res in (author_results, translator_results, title_results, text_phrase, everywhere):
        res.sort(reverse=True)

    # We don't want to redirect to book text, but rather display result page
    # even with one result.
    if not results:
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render_to_response(
            'catalogue/search_no_hits.html',
            {
                'tags': tags,
                'prefix': query,
                'form': form,
                'did_you_mean': suggestion
            },
            context_instance=RequestContext(request))

    return render_to_response(
        'catalogue/search_multiple_hits.html',
        {
            'tags': tags,
            'prefix': query,
            'results': {
                'author': author_results,
                'translator': translator_results,
                'title': title_results,
                'content': text_phrase,
                'other': everywhere
            },
            'did_you_mean': suggestion
        },
        context_instance=RequestContext(request))