bump librarian to master
[wolnelektury.git] / apps / search / views.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from django.shortcuts import render_to_response, get_object_or_404
5 from django.template import RequestContext
6 from django.contrib.auth.decorators import login_required
7 from django.views.decorators import cache
8 from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect
9 from django.utils.translation import ugettext as _
10
11 from catalogue.utils import split_tags
12 from catalogue.models import Book, Tag, Fragment
13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
14 from catalogue.views import JSONResponse
15 from search import Search, JVM, SearchResult
16 from lucene import StringReader
17 from suggest.forms import PublishingSuggestForm
18 from time import sleep
19 import re
20 import enchant
21 import json
22
# Polish ('pl_PL') enchant dictionary, created once when the module is
# imported.  did_you_mean() uses it to check tokens and to fetch spelling
# suggestions.
dictionary = enchant.Dict('pl_PL')
24
25
def match_word_re(word):
    r"""Build a whole-word regex for ``word`` in the default database's dialect.

    The dialect is picked from the ``ENGINE`` string of
    ``settings.DATABASES['default']``:

    * contains ``'sqlite'`` -- ``\b`` word boundaries,
    * contains ``'mysql'``  -- ``[[:<:]]`` / ``[[:>:]]`` word boundaries.

    ``word`` is interpolated as-is, without any escaping.  For every other
    engine no pattern is built and ``None`` is returned.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        template = r"\b%s\b"
    elif 'mysql' in engine:
        template = "[[:<:]]%s[[:>:]]"
    else:
        # Unsupported backend: same result as the implicit fall-through.
        return None
    return template % word
31
32
def did_you_mean(query, tokens):
    """Suggest a spell-corrected version of ``query``.

    A token is left alone when it matches, as a whole word and ignoring
    case, the name of a tag in the ``author`` category.  Otherwise, if the
    module-level enchant ``dictionary`` does not recognise it, the first
    suggestion of the dictionary (lower-cased) becomes its replacement,
    unless that suggestion differs from the token only by case.

    :param query: the query string as entered by the user.
    :param tokens: iterable of tokens extracted from ``query``.
    :returns: ``query`` with every corrected token substituted, or ``None``
        when no token needed a correction.
    """
    change = {}
    for t in tokens:
        # .exists() asks the database a yes/no question instead of loading
        # every matching author tag only to count them.
        is_author = Tag.objects.filter(
            category='author', name__iregex=match_word_re(t)).exists()
        if is_author:
            continue

        if dictionary.check(t):
            continue

        suggestions = dictionary.suggest(t)
        if not suggestions:
            # The dictionary has nothing to offer for this token.
            continue

        change_to = suggestions[0].lower()
        if change_to != t.lower():
            change[t] = change_to

    if not change:
        return None

    # NOTE: this is plain substring replacement, so a token is also replaced
    # where it occurs inside a longer word of the query.
    for frm, to in change.items():
        query = query.replace(frm, to)

    return query
55
56
# Attach the thread that imports this module to the JVM exported by the
# ``search`` package (presumably required before any Lucene call is made
# from that thread -- confirm against the PyLucene documentation).
JVM.attachCurrentThread()
# Module-wide Search instance, created lazily by get_search():
# None = not created yet, False = creation in progress.
_search = None
59
60
def get_search():
    """Return the module-wide ``Search`` instance, creating it on first use.

    ``_search`` doubles as a state marker:

    * ``None``  -- no instance has been created yet,
    * ``False`` -- some caller is currently constructing the instance,
    * anything else -- the ready instance.

    Callers that arrive while construction is in progress poll once per
    second until the marker changes.

    :returns: the shared ``Search`` instance.
    :raises: whatever ``Search()`` raises; in that case the marker is reset
        to ``None`` so that waiting and future callers retry the
        construction instead of polling forever.
    """
    global _search

    while _search is False:
        sleep(1)

    if _search is None:
        _search = False
        instance = None
        try:
            instance = Search()
        finally:
            # On failure ``instance`` is still None, which releases any
            # caller stuck in the polling loop above.
            _search = instance
    return _search
71
72
def hint(request):
    """Return autocomplete hints (tags and books) for a search prefix.

    GET parameters read:

    * ``term``     -- the prefix to complete; fewer than two characters
      yields an empty JSON list,
    * ``tags``     -- optional tag list, parsed with ``Tag.get_tag_list``,
    * ``callback`` -- optional JSONP callback name.

    :param request: the incoming HTTP request.
    :returns: a ``JSONResponse`` with a list of
        ``{'label', 'category', 'id', 'url'}`` dicts (tags first, then
        books); the same payload wrapped as ``callback(...);`` when a valid
        ``callback`` is given; or a 400 response when ``callback`` is not a
        plain (optionally dotted) identifier.
    """
    prefix = request.GET.get('term', '')
    if len(prefix) < 2:
        return JSONResponse([])
    JVM.attachCurrentThread()

    search = get_search()
    # NOTE(review): this hint object is configured below but is not passed
    # to hint_tags() / hint_books(), so it does not restrict the results yet.
    hint_filter = search.hint()
    try:
        tag_names = request.GET.get('tags', '')
        hint_filter.tags(Tag.get_tag_list(tag_names))
    except Exception:
        # Best effort: an unparsable ``tags`` parameter just means no
        # tag restriction.
        pass

    # Tags are meant to restrict the hints here, but tags can be attached to
    # a book as well as to fragments.  If the tags concern only the book, the
    # new ones must belong to the same book; if they concern themes, they
    # must belong to the same fragment.

    tags = search.hint_tags(prefix, pdcounter=True)
    books = search.hint_books(prefix)

    def is_dupe(tag):
        """Tell whether a public-domain-counter entry duplicates another hint."""
        if isinstance(tag, PDCounterAuthor):
            return any(t.slug == tag.slug and t != tag for t in tags)
        if isinstance(tag, PDCounterBook):
            # NOTE(review): ``tag`` itself is an element of ``tags``, so this
            # looks like it matches every PDCounterBook; comparing against
            # ``books`` may have been intended -- confirm before changing.
            return any(b.slug == tag.slug for b in tags)
        return False

    tags = [t for t in tags if not is_dupe(t)]

    def category_name(c):
        """Translate a category name, ignoring a leading ``pd_``."""
        if c.startswith('pd_'):
            c = c[len('pd_'):]
        return _(c)

    tag_hints = [{'label': t.name,
                  'category': category_name(t.category),
                  'id': t.id,
                  'url': t.get_absolute_url()}
                 for t in tags]
    book_hints = [{'label': b.title,
                   'category': _('book'),
                   'id': b.id,
                   'url': b.get_absolute_url()}
                  for b in books]
    data = tag_hints + book_hints

    callback = request.GET.get('callback', None)
    if not callback:
        return JSONResponse(data)

    # The callback name is echoed verbatim into the response body, so only
    # accept something that looks like a (dotted) JavaScript identifier.
    if not re.match(r'[A-Za-z_$][A-Za-z0-9_$.]*\Z', callback):
        return HttpResponse("Invalid callback name.", status=400,
                            content_type="text/plain; charset=utf-8")

    return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                        content_type="application/json; charset=utf-8")
128             
129
130
def _tidy_snippet(snippet):
    """Collapse whitespace around line breaks and trim both ends of ``snippet``."""
    collapsed = re.sub(r"[ \t\n]*\n[ \t\n]*", u"\n", snippet)
    return re.sub(r"(^[ \t\n]+|[ \t\n]+$)", u"", collapsed)


def main(request):
    """Run a full-text search for the ``q`` GET parameter and render the outcome.

    Flow:

    1. A query shorter than two characters renders
       ``catalogue/search_too_short.html``.
    2. Author and title phrase searches are run; hits of a mixed
       author/title/tags search boost the matching author/title results,
       and mixed hits without such a match are kept for the "other" group.
    3. Content is searched (phrase and term search, with snippets) and an
       "everywhere" search is run, from which books already found are dropped.
    4. Every group is aggregated and sorted, snippets are tidied, and results
       whose book no longer exists are discarded.
    5. Exactly one result redirects to its single fragment hit (if there is
       exactly one) or to the book; no results render
       ``catalogue/search_no_hits.html`` with a publishing-suggestion form;
       otherwise ``catalogue/search_multiple_hits.html`` is rendered.

    :param request: the incoming HTTP request.
    :returns: a rendered response or an ``HttpResponseRedirect``.
    """
    JVM.attachCurrentThread()  # where to put this?

    fuzzy = False  # 0.8

    query = request.GET.get('q', '')
    # TODO: restricting the search to a given book / tag list was sketched
    # here but never enabled.

    if len(query) < 2:
        return render_to_response('catalogue/search_too_short.html', {'prefix': query},
                                  context_instance=RequestContext(request))

    search = get_search()
    tags = search.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy)
    tags = split_tags(tags)

    toks = StringReader(query)
    tokens_cache = {}

    author_results = search.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
    title_results = search.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)

    # Boost main author/title results with the mixed search, and save the
    # mixed results that matched neither for the end of the list.
    author_title_mixed = search.search_some(toks, ['authors', 'title', 'tags'], fuzzy=fuzzy, tokens_cache=tokens_cache)
    author_title_rest = []
    primary_results = author_results + title_results
    for b in author_title_mixed:
        bks = [ba for ba in primary_results if ba.book_id == b.book_id]
        for b2 in bks:
            b2.boost *= 1.1
        # An emptiness test is required here: ``bks is []`` compares identity
        # with a brand new list and is therefore never true.
        if not bks:
            author_title_rest.append(b)

    # Do a phrase search but a term search as well - this can give us better
    # snippets than search_everywhere, because the query uses only one field.
    text_phrase = SearchResult.aggregate(
        search.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False, slop=4),
        search.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False))

    everywhere = search.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)

    # Drop "everywhere" hits for books that one of the more specific
    # searches has already found.
    found_book_ids = set(
        r.book_id for r in author_results + title_results + text_phrase)
    unseen = []
    for e in everywhere:
        if e.book_id in found_book_ids:
            # Carried over from the previous implementation; the entry is
            # discarded right away, so this has no visible effect here.
            e.boost = 0.9
        else:
            unseen.append(e)
    everywhere = unseen

    author_results = SearchResult.aggregate(author_results)
    title_results = SearchResult.aggregate(title_results)

    everywhere = SearchResult.aggregate(everywhere, author_title_rest)

    for res in [author_results, title_results, text_phrase, everywhere]:
        res.sort(reverse=True)
        for r in res:
            for h in r.hits:
                h['snippets'] = [_tidy_snippet(s) for s in h['snippets']]

    suggestion = did_you_mean(query, search.get_tokens(toks, field="SIMPLE"))

    def ensure_exists(r):
        """Return the result's book, or False when it is gone from the database."""
        try:
            return r.book
        except Book.DoesNotExist:
            return False

    author_results = [r for r in author_results if ensure_exists(r)]
    title_results = [r for r in title_results if ensure_exists(r)]
    text_phrase = [r for r in text_phrase if ensure_exists(r)]
    everywhere = [r for r in everywhere if ensure_exists(r)]

    # Only results with an existing book remain; sort them all together.
    results = author_results + title_results + text_phrase + everywhere
    results.sort(reverse=True)

    if len(results) == 1:
        only = results[0]
        fragment_hits = [h for h in only.hits if 'fragment' in h]
        if len(fragment_hits) == 1:
            return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
        return HttpResponseRedirect(only.book.get_absolute_url())

    if not results:
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render_to_response('catalogue/search_no_hits.html',
                                  {'tags': tags,
                                   'prefix': query,
                                   "form": form,
                                   'did_you_mean': suggestion},
                                  context_instance=RequestContext(request))

    return render_to_response('catalogue/search_multiple_hits.html',
                              {'tags': tags,
                               'prefix': query,
                               'results': {'author': author_results,
                                           'title': title_results,
                                           'content': text_phrase,
                                           'other': everywhere},
                               'did_you_mean': suggestion},
                              context_instance=RequestContext(request))