optimize db usage in tagged object list
[wolnelektury.git] / src / search / views.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6 from django.shortcuts import render_to_response
7 from django.template import RequestContext
8 from django.views.decorators import cache
9 from django.http import HttpResponse, JsonResponse
10 from django.utils.translation import ugettext as _
11
12 from catalogue.utils import split_tags
13 from catalogue.models import Book
14 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
15 from search.index import Search, SearchResult
16 from suggest.forms import PublishingSuggestForm
17 import re
18 import json
19
20
def match_word_re(word):
    """Build a whole-word regular expression for the default database.

    The regex dialect is chosen by looking for a backend name inside
    ``settings.DATABASES['default']['ENGINE']``.  ``word`` is interpolated
    as-is (it is not regex-escaped).

    Returns the pattern string for SQLite or MySQL engines, and ``None``
    for any other engine.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        # Word-boundary anchors on both sides of the word.
        return r"\b%s\b" % word
    if 'mysql' in engine:
        # Start-of-word / end-of-word bracket markers.
        return "[[:<:]]%s[[:>:]]" % word
    # No pattern is defined for other engines.
    return None
26
27
# Characters removed from user-supplied queries by remove_query_syntax_chars():
# backslash, slash, asterisk, colon, parentheses and curly braces.
query_syntax_chars = re.compile(r"[\\/*:(){}]")


def remove_query_syntax_chars(query, replace=' '):
    """Return ``query`` with each query-syntax character replaced.

    Args:
        query: the raw query string.
        replace: the text substituted for every character matched by
            ``query_syntax_chars``; defaults to a single space.

    Returns:
        The cleaned query string.
    """
    # The previous implementation ignored ``replace`` and always inserted a
    # space.  A function replacement is used so that ``replace`` is inserted
    # literally: a plain string would have its backslashes interpreted as
    # regex template escapes.
    return query_syntax_chars.sub(lambda _match: replace, query)
33
34
def did_you_mean(query, tokens):
    """Return a spelling suggestion for ``query``; currently a no-op.

    The correction logic is commented out below, so ``query`` is returned
    unchanged and ``tokens`` is not used.
    """
    return query
    # Disabled implementation, kept for reference.  It skipped tokens that
    # matched an author tag name and replaced the remaining tokens with the
    # first dictionary suggestion, returning None when nothing changed.
    # change = {}
    # for t in tokens:
    #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
    #     if len(authors) > 0:
    #         continue

    #     if False:
    #         if not dictionary.check(t):
    #             try:
    #                 change_to = dictionary.suggest(t)[0].lower()
    #                 if change_to != t.lower():
    #                     change[t] = change_to
    #             except IndexError:
    #                 pass

    # if change == {}:
    #     return None

    # for frm, to in change.items():
    #     query = query.replace(frm, to)

    # return query
59
60
@cache.never_cache
def hint(request):
    """Return autocomplete hints (tags first, then books) as JSON.

    GET parameters:
        term: the text typed so far; fewer than 2 characters yields ``[]``.
        max: optional maximum number of hints.  A missing, non-numeric or
            non-positive value means "no limit".
        callback: optional JSONP callback name.  When given, the payload is
            wrapped in a call to that function.

    Returns:
        A JSON list of ``{'label', 'category', 'id', 'url'}`` dicts, the
        same list wrapped as JSONP, or an HTTP 400 response when
        ``callback`` is not a (possibly dotted) JavaScript identifier.
    """
    prefix = request.GET.get('term', '')
    if len(prefix) < 2:
        return JsonResponse([], safe=False)

    prefix = remove_query_syntax_chars(prefix)

    search = Search()
    # Tags are going to narrow the results here, but tags can be attached
    # both to a book and to its fragments.  If the tags concern only the
    # book, it matters that the new ones belong to the same book; if they
    # concern themes, it matters that they belong to the same fragment.

    try:
        limit = int(request.GET.get('max', ''))
    except ValueError:
        limit = -1
    if limit < 1:
        # -1 acts as "unlimited": counting down from it never reaches 0.
        limit = -1

    found_tags = search.hint_tags(prefix, pdcounter=True)

    def is_dupe(tag):
        """Tell whether a pdcounter entry shares its slug with another hit."""
        if isinstance(tag, PDCounterAuthor):
            return any(t.slug == tag.slug and t != tag for t in found_tags)
        if isinstance(tag, PDCounterBook):
            # NOTE(review): unlike the author branch, this does not exclude
            # ``tag`` itself, so a stub present in ``found_tags`` always
            # matches its own slug.  Behaviour kept as-is; confirm intent.
            return any(b.slug == tag.slug for b in found_tags)
        return False

    def category_name(c):
        """Translate a category name, ignoring a leading 'pd_' marker."""
        if c.startswith('pd_'):
            c = c[len('pd_'):]
        return _(c)

    data = []

    # A list comprehension (rather than filter()) gives a real list and
    # evaluates is_dupe() against the complete, unfiltered hit list.
    tags = [t for t in found_tags if not is_dupe(t)]
    for t in tags:
        if not limit:
            break
        limit -= 1
        data.append({
            'label': t.name,
            'category': category_name(t.category),
            'id': t.id,
            'url': t.get_absolute_url()
            })
    if limit:
        # Fill whatever is left of the limit with book title hints.
        books = search.hint_books(prefix)
        for b in books:
            if not limit:
                break
            limit -= 1
            data.append({
                'label': b.title,
                'category': _('book'),
                'id': b.id,
                'url': b.get_absolute_url()
                })

    callback = request.GET.get('callback', None)
    if not callback:
        return JsonResponse(data, safe=False)

    # The callback name comes straight from the request and is echoed into
    # executable script, so only plain (optionally dotted) identifiers are
    # accepted.
    if not re.match(r'[A-Za-z_$][A-Za-z0-9_$]*(?:\.[A-Za-z_$][A-Za-z0-9_$]*)*\Z',
                    callback):
        return HttpResponse("Invalid callback name.", status=400,
                            content_type="text/plain; charset=utf-8")

    # JSONP is a script, not a JSON document: serve it with a JavaScript
    # MIME type so browsers enforcing "nosniff" still execute it.
    return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                        content_type="application/javascript; charset=utf-8")
130
131
@cache.never_cache
def main(request):
    """Render the search results page for the ``q`` GET parameter.

    Runs phrase searches on authors, translators and titles, a mixed
    author/translator/title/tag search, a text search and an "everywhere"
    search, then groups the hits per section for the template.

    Renders:
        * ``catalogue/search_too_short.html`` when the query has fewer
          than 2 characters,
        * ``catalogue/search_no_hits.html`` (with a publishing suggestion
          form) when no section has any result,
        * ``catalogue/search_multiple_hits.html`` otherwise.
    """
    query = request.GET.get('q', '')

    if len(query) < 2:
        return render_to_response(
            'catalogue/search_too_short.html', {'prefix': query},
            context_instance=RequestContext(request))

    query = remove_query_syntax_chars(query)

    search = Search()

    theme_terms = search.index.analyze(text=query, field="themes_pl") \
        + search.index.analyze(text=query, field="themes")

    # change hints
    tags = search.hint_tags(query, pdcounter=True, prefix=False)
    tags = split_tags(tags)

    author_results = search.search_phrase(query, 'authors', book=True)
    translator_results = search.search_phrase(query, 'translators', book=True)

    title_results = search.search_phrase(query, 'title', book=True)

    # Boost main author/title results with the mixed search, and save the
    # mixed results that matched none of them for the end of the list.
    author_title_mixed = search.search_some(
        query, ['authors', 'translators', 'title', 'tags'], query_terms=theme_terms)
    author_title_rest = []

    # The three lists are not modified inside the loop, so join them once.
    phrase_results = author_results + translator_results + title_results
    for b in author_title_mixed:
        also_in_mixed = [ba for ba in phrase_results if ba.book_id == b.book_id]
        for b2 in also_in_mixed:
            b2.boost *= 1.1
        # This used to read ``also_in_mixed is []``, an identity test that
        # is never true, so the leftover list always stayed empty.
        if not also_in_mixed:
            author_title_rest.append(b)

    # Do a phrase search but a term search as well - this can give us better
    # snippets than search_everywhere, because the query uses only one field.
    text_phrase = SearchResult.aggregate(
        search.search_phrase(query, 'text', snippets=True, book=False),
        search.search_some(query, ['text'], snippets=True, book=False, query_terms=theme_terms))

    everywhere = search.search_everywhere(query, query_terms=theme_terms)

    # Drop "everywhere" hits for books some more specific search has
    # already found.
    found_book_ids = set(
        r.book_id
        for r in author_results + translator_results + title_results + text_phrase)
    remaining = []
    for hit in everywhere:
        if hit.book_id in found_book_ids:
            # Kept from the previous implementation, which lowered the
            # boost of a hit before discarding it.
            hit.boost = 0.9
        else:
            remaining.append(hit)
    everywhere = remaining

    author_results = SearchResult.aggregate(author_results)
    translator_results = SearchResult.aggregate(translator_results)
    title_results = SearchResult.aggregate(title_results)

    everywhere = SearchResult.aggregate(everywhere, author_title_rest)

    for field, res in [('authors', author_results),
                       ('translators', translator_results),
                       ('title', title_results),
                       ('text', text_phrase),
                       ('text', everywhere)]:
        res.sort(reverse=True)
        for r in res:
            search.get_snippets(r, query, field, 3)

    suggestion = u''

    def ensure_exists(r):
        """Return the result's book, or False when it no longer exists."""
        try:
            return r.book
        except Book.DoesNotExist:
            return False

    # List comprehensions keep these as real lists, which the sorting and
    # the emptiness check below rely on.
    author_results = [r for r in author_results if ensure_exists(r)]
    translator_results = [r for r in translator_results if ensure_exists(r)]
    title_results = [r for r in title_results if ensure_exists(r)]
    text_phrase = [r for r in text_phrase if ensure_exists(r)]
    everywhere = [r for r in everywhere if ensure_exists(r)]

    sections = (author_results, translator_results, title_results, text_phrase, everywhere)
    # Books are now known to exist; sort each section again.
    for res in sections:
        res.sort(reverse=True)

    # We don't want to redirect to book text, but rather display result page even with one result.
    # if len(results) == 1:
    #     fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
    #     if len(fragment_hits) == 1:
    #         #anchor = fragment_hits[0]['fragment']
    #         #frag = Fragment.objects.get(anchor=anchor)
    #         return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
    #     return HttpResponseRedirect(results[0].book.get_absolute_url())
    if not any(sections):
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render_to_response(
            'catalogue/search_no_hits.html',
            {
                'tags': tags,
                'prefix': query,
                'form': form,
                'did_you_mean': suggestion
            },
            context_instance=RequestContext(request))

    return render_to_response(
        'catalogue/search_multiple_hits.html',
        {
            'tags': tags,
            'prefix': query,
            'results': {
                'author': author_results,
                'translator': translator_results,
                'title': title_results,
                'content': text_phrase,
                'other': everywhere
            },
            'did_you_mean': suggestion
        },
        context_instance=RequestContext(request))