Generally working version.
[wolnelektury.git] / src / search / views.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6 from django.shortcuts import render_to_response, get_object_or_404
7 from django.template import RequestContext
8 from django.views.decorators import cache
9 from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect, JsonResponse
10 from django.utils.translation import ugettext as _
11
12 from catalogue.utils import split_tags
13 from catalogue.models import Book, Tag, Fragment
14 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
15 from search.index import Search, SearchResult
16 from suggest.forms import PublishingSuggestForm
17 import re
18 #import enchant
19 import json
20
21
def match_word_re(word):
    """Build a whole-word regex for *word* in the default database's dialect.

    The dialect is picked from the ``ENGINE`` entry of
    ``settings.DATABASES['default']``:

    * an engine name containing ``sqlite`` gets ``\\b`` word boundaries,
    * an engine name containing ``mysql`` gets ``[[:<:]]`` / ``[[:>:]]``
      markers.

    *word* is interpolated as-is (it is not regex-escaped).  For any other
    engine the function returns ``None``.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        return r"\b%s\b" % word
    if 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % word
    # No pattern is defined for other backends.
    return None
27
28
# Characters that have a special meaning in the search engine's query syntax.
query_syntax_chars = re.compile(r"[\\/*:(){}]")


def remove_query_syntax_chars(query, replace=' '):
    """Return *query* with every query-syntax character substituted.

    Each character matched by ``query_syntax_chars`` (backslash, ``/``,
    ``*``, ``:``, parentheses and braces) is replaced with *replace*,
    which defaults to a single space.

    *replace* is inserted literally: a callable is handed to ``sub`` so
    that backslashes in it are not interpreted as regex template escapes.
    """
    return query_syntax_chars.sub(lambda _match: replace, query)
34
35
def did_you_mean(query, tokens):
    """Return a spelling suggestion for *query*; currently a pass-through.

    *tokens* is accepted for interface compatibility but is not used:
    the function returns *query* unchanged.
    """
    # NOTE: a previous implementation is disabled.  For every token that
    # did not match an author tag name (via ``match_word_re``) it asked a
    # spell-checking ``dictionary`` for the first suggestion, replaced the
    # token in the query when the lower-cased suggestion differed, and
    # returned None when nothing was changed.  The dictionary lookup was
    # already switched off (``if False:``) and the ``enchant`` import is
    # commented out at the top of this file.
    return query
60
61
# A JSONP callback must be a (possibly dotted) JavaScript identifier, e.g.
# "jQuery1234_5678" or "ns.callback".  Anything else is rejected so that the
# request cannot inject arbitrary script text into the response body.
_jsonp_callback_re = re.compile(r"^[A-Za-z_$][\w$]*(?:\.[A-Za-z_$][\w$]*)*\Z")


@cache.never_cache
def hint(request):
    """Return autocomplete hints for the search box as JSON (or JSONP).

    GET parameters read:

    * ``term`` -- the typed prefix; fewer than 2 characters yields ``[]``.
    * ``max`` -- optional maximum number of hints; a missing, non-integer
      or non-positive value means "no limit".
    * ``callback`` -- optional JSONP callback name.  When given, the JSON
      payload is wrapped as ``callback(payload);``.  A name that is not a
      dotted identifier is answered with HTTP 400.

    The payload is a list of dicts with the keys ``label``, ``category``,
    ``id`` and ``url``: first the tags returned by ``Search.hint_tags``
    (minus duplicates, see ``is_dupe``), then the books returned by
    ``Search.hint_books`` while the limit is not exhausted.
    """
    prefix = request.GET.get('term', '')
    if len(prefix) < 2:
        return JsonResponse([], safe=False)

    prefix = remove_query_syntax_chars(prefix)

    search = Search()
    # Design note (translated from Polish):
    # tags will restrict the results here,
    # but tags can be attached to a book as well as to fragments;
    # if the tags concern only the book, the new ones must be in the same book,
    # whereas if they concern themes, they must be in the same fragment.

    def is_dupe(tag):
        """Tell whether a pdcounter entry duplicates another hinted entry."""
        if isinstance(tag, PDCounterAuthor):
            # Another entry with the same slug makes this stub redundant.
            return any(t.slug == tag.slug and t != tag for t in all_tags)
        if isinstance(tag, PDCounterBook):
            # NOTE(review): unlike the author case this comparison does not
            # exclude `tag` itself, so any PDCounterBook present in
            # `all_tags` matches itself and is dropped.  Kept as in the
            # original -- confirm whether that is intended.
            return any(b.slug == tag.slug for b in all_tags)
        return False

    def category_name(c):
        """Translate a category name, ignoring a leading 'pd_' prefix."""
        if c.startswith('pd_'):
            c = c[len('pd_'):]
        return _(c)

    # A limit of -1 never reaches 0 while being decremented, i.e. "no limit".
    try:
        limit = int(request.GET.get('max', ''))
    except ValueError:
        limit = -1
    if limit < 1:
        limit = -1

    data = []

    all_tags = search.hint_tags(prefix, pdcounter=True)
    # Build a real list: `filter()` is lazy on Python 3 and its result is
    # always truthy there, which would break the duplicate check.
    tags = [t for t in all_tags if not is_dupe(t)]
    for t in tags:
        if not limit:
            break
        limit -= 1
        data.append({
            'label': t.name,
            'category': category_name(t.category),
            'id': t.id,
            'url': t.get_absolute_url(),
        })

    if limit:
        for b in search.hint_books(prefix):
            if not limit:
                break
            limit -= 1
            data.append({
                'label': b.title,
                'category': _('book'),
                'id': b.id,
                'url': b.get_absolute_url(),
            })

    callback = request.GET.get('callback', None)
    if not callback:
        return JsonResponse(data, safe=False)

    if not _jsonp_callback_re.match(callback):
        # The callback is echoed verbatim into the response; refuse anything
        # that is not a plain identifier instead of reflecting it.
        return HttpResponse("Invalid callback name.", status=400,
                            content_type="text/plain; charset=utf-8")

    # NOTE(review): the body is JavaScript but is served as application/json
    # (unchanged from the original) -- confirm clients rely on this type.
    return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                        content_type="application/json; charset=utf-8")
131
132
@cache.never_cache
def main(request):
    """Render the full-text search results page for the ``q`` GET parameter.

    * A query shorter than 2 characters renders
      ``catalogue/search_too_short.html``.
    * Otherwise the query is stripped of query-syntax characters and run
      against authors, translators, titles, book text and "everywhere".
      Results are grouped per category, sorted in descending order, given
      snippets and stripped of entries whose book no longer exists.
    * No results at all renders ``catalogue/search_no_hits.html`` together
      with a ``PublishingSuggestForm``; otherwise
      ``catalogue/search_multiple_hits.html`` is rendered with the results
      under the keys ``author``, ``translator``, ``title``, ``content``
      and ``other``.
    """
    query = request.GET.get('q', '')

    if len(query) < 2:
        return render_to_response('catalogue/search_too_short.html',
                                  {'prefix': query},
                                  context_instance=RequestContext(request))

    query = remove_query_syntax_chars(query)

    search = Search()

    theme_terms = search.index.analyze(text=query, field="themes_pl") \
        + search.index.analyze(text=query, field="themes")

    # Tag hints shown next to the results.
    tags = search.hint_tags(query, pdcounter=True, prefix=False)
    tags = split_tags(tags)

    author_results = search.search_phrase(query, 'authors', book=True)
    translator_results = search.search_phrase(query, 'translators', book=True)
    title_results = search.search_phrase(query, 'title', book=True)

    # Boost main author/title results with mixed search, and save the mixed
    # results that matched none of them for the end of the list.
    author_title_mixed = search.search_some(
        query, ['authors', 'translators', 'title', 'tags'], query_terms=theme_terms)
    author_title_rest = []

    primary_results = author_results + translator_results + title_results
    for mixed in author_title_mixed:
        also_in_mixed = [r for r in primary_results if r.book_id == mixed.book_id]
        for r in also_in_mixed:
            r.boost *= 1.1
        # The original tested `also_in_mixed is []`, which is never true, so
        # unmatched mixed results were silently lost.
        if not also_in_mixed:
            author_title_rest.append(mixed)

    # Do a phrase search but a term search as well - this can give us better
    # snippets than search_everywhere, because the query is using only one field.
    text_phrase = SearchResult.aggregate(
        search.search_phrase(query, 'text', snippets=True, book=False),
        search.search_some(query, ['text'], snippets=True, book=False, query_terms=theme_terms))

    everywhere = search.search_everywhere(query, query_terms=theme_terms)

    # Drop "everywhere" hits for books already present in a more specific
    # category.
    found_elsewhere = author_results + translator_results + title_results + text_phrase
    remaining = []
    for e in everywhere:
        if any(e.book_id == r.book_id for r in found_elsewhere):
            # Kept from the original: the dropped hit is demoted before
            # being discarded.
            e.boost = 0.9
        else:
            remaining.append(e)
    everywhere = remaining

    author_results = SearchResult.aggregate(author_results)
    translator_results = SearchResult.aggregate(translator_results)
    title_results = SearchResult.aggregate(title_results)

    everywhere = SearchResult.aggregate(everywhere, author_title_rest)

    for field, res in [('authors', author_results),
                       ('translators', translator_results),
                       ('title', title_results),
                       ('text', text_phrase),
                       ('text', everywhere)]:
        res.sort(reverse=True)
        for r in res:
            search.get_snippets(r, query, field, 3)

    suggestion = u''

    def ensure_exists(r):
        """Return the result's book, or False when it no longer exists."""
        try:
            return r.book
        except Book.DoesNotExist:
            return False

    # Real lists are needed below (concatenation and in-place sort), so do
    # not rely on `filter()` returning a list.
    author_results = [r for r in author_results if ensure_exists(r)]
    translator_results = [r for r in translator_results if ensure_exists(r)]
    title_results = [r for r in title_results if ensure_exists(r)]
    text_phrase = [r for r in text_phrase if ensure_exists(r)]
    everywhere = [r for r in everywhere if ensure_exists(r)]

    results = author_results + translator_results + title_results + text_phrase + everywhere
    # Books are now known to exist; sort each category again.
    for res in (author_results, translator_results, title_results, text_phrase, everywhere):
        res.sort(reverse=True)

    # We don't want to redirect to book text, but rather display the result
    # page even with one result.
    if not results:
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render_to_response('catalogue/search_no_hits.html',
                                  {'tags': tags,
                                   'prefix': query,
                                   "form": form,
                                   'did_you_mean': suggestion},
                                  context_instance=RequestContext(request))

    return render_to_response('catalogue/search_multiple_hits.html',
                              {'tags': tags,
                               'prefix': query,
                               'results': {'author': author_results,
                                           'translator': translator_results,
                                           'title': title_results,
                                           'content': text_phrase,
                                           'other': everywhere},
                               'did_you_mean': suggestion},
                              context_instance=RequestContext(request))