56eef0976f9d1e2f097ef40d0046d47b50fb0935
[wolnelektury.git] / apps / search / views.py
1 # -*- coding: utf-8 -*-
2
from django.conf import settings
from django.shortcuts import render_to_response, get_object_or_404
from django.template import RequestContext
from django.contrib.auth.decorators import login_required
from django.views.decorators import cache
from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect
from django.utils.translation import ugettext as _

from catalogue.utils import split_tags
from catalogue.models import Book, Tag, Fragment
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from catalogue.views import JSONResponse
from search import Search, JVM, SearchResult
from lucene import StringReader
from suggest.forms import PublishingSuggestForm
from time import sleep
import re
import threading
import enchant
21
# Module-wide enchant dictionary for the 'pl_PL' locale; did_you_mean() uses it
# to spell-check query tokens and to fetch replacement suggestions.
dictionary = enchant.Dict('pl_PL')
23
24
def match_word_re(word):
    """Build a whole-word regex for ``word`` in the configured database's dialect.

    The pattern is meant for a Django ``__iregex`` lookup, so the word-boundary
    syntax has to match what the default database backend understands:

    * sqlite     -> ``\\bword\\b``
    * mysql      -> ``[[:<:]]word[[:>:]]``
    * postgresql -> ``\\yword\\y``

    ``word`` is interpolated verbatim (it is not regex-escaped).

    Returns the pattern string, or None when the default database engine is
    not one of the backends listed above.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        return r"\b%s\b" % word
    elif 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % word
    elif 'postgresql' in engine:
        # PostgreSQL has no \b word boundary; \y is its equivalent.
        return r"\y%s\y" % word
    # Unknown backend: keep returning None, but do it explicitly.
    return None
30
31
def did_you_mean(query, tokens):
    """Return ``query`` with misspelled tokens replaced by dictionary suggestions.

    For each token in ``tokens``:

    * tokens matching an author tag name (whole word, case-insensitive) are
      left alone,
    * tokens accepted by the module-level ``dictionary`` are left alone,
    * otherwise the first suggestion (lower-cased) is recorded as the
      replacement, unless it only differs from the token by case.

    Returns the corrected query string, or None when nothing would change.
    """
    change = {}
    for token in tokens:
        # .exists() asks the database for a single row instead of loading
        # every matching author tag just to count them.
        is_author = Tag.objects.filter(
            category='author', name__iregex=match_word_re(token)).exists()
        if is_author:
            continue

        if dictionary.check(token):
            continue

        suggestions = dictionary.suggest(token)
        if not suggestions:
            continue

        change_to = suggestions[0].lower()
        if change_to != token.lower():
            change[token] = change_to

    if not change:
        return None

    # NOTE(review): plain substring replacement - a token that also occurs
    # inside a longer word of the query is replaced there as well.
    for frm, to in change.items():
        query = query.replace(frm, to)

    return query
54
55
JVM.attachCurrentThread()

# Lazily created, process-wide Search instance (see get_search()).
_search = None
# Serializes the first construction of the Search instance.
_search_lock = threading.Lock()


def get_search():
    """Return the shared ``Search`` instance, creating it on first use.

    Construction is guarded by a lock so that concurrent first callers build
    only one instance; the others block until it is ready. If ``Search()``
    raises, the exception propagates and ``_search`` stays None, so the next
    call simply retries instead of waiting forever on a half-initialized
    marker value.
    """
    global _search

    # Fast path: already initialized, no locking needed.
    if _search is None:
        with _search_lock:
            # Re-check under the lock: another thread may have finished
            # the construction while this one was waiting.
            if _search is None:
                _search = Search()
    return _search
70
71
def hint(request):
    """Autocomplete view: return tag and book hints for the ``term`` GET parameter.

    Responds with a JSON list of dicts with the keys ``label``, ``category``,
    ``id`` and ``url`` - tag hints first, then book hints. A term shorter than
    two characters yields an empty list.
    """
    prefix = request.GET.get('term', '')
    if len(prefix) < 2:
        return JSONResponse([])
    JVM.attachCurrentThread()

    search = get_search()
    tag_hint = search.hint()
    try:
        tag_hint.tags(Tag.get_tag_list(request.GET.get('tags', '')))
    except Exception:
        # Best effort: an unparsable ``tags`` parameter must not break hinting.
        # NOTE(review): tag_hint is not passed to the hint_* calls below, so
        # this restriction does not appear to influence the response.
        pass

    # Tags are meant to restrict the hints here, but tags can be attached to
    # a book as well as to fragments. If the tags concern only the book, the
    # new hints should belong to the same book; if they concern themes, the
    # new hints should belong to the same fragment.

    candidates = search.hint_tags(prefix, pdcounter=True)
    books = search.hint_books(prefix)

    def is_dupe(tag):
        """Tell whether a pdcounter entry duplicates another hint with the same slug."""
        if isinstance(tag, PDCounterAuthor):
            return any(t.slug == tag.slug and t != tag for t in candidates)
        if isinstance(tag, PDCounterBook):
            # NOTE(review): this compares against the tag candidates, which
            # include ``tag`` itself, so it holds for every PDCounterBook found
            # there; possibly ``books`` was intended. Behavior kept as is.
            return any(b.slug == tag.slug for b in candidates)
        return False

    tags = [t for t in candidates if not is_dupe(t)]

    def category_name(c):
        """Translate a category name, dropping the ``pd_`` prefix if present."""
        if c.startswith('pd_'):
            c = c[len('pd_'):]
        return _(c)

    tag_hints = [{'label': t.name,
                  'category': category_name(t.category),
                  'id': t.id,
                  'url': t.get_absolute_url()}
                 for t in tags]
    book_hints = [{'label': b.title,
                   'category': _('book'),
                   'id': b.id,
                   'url': b.get_absolute_url()}
                  for b in books]
    return JSONResponse(tag_hints + book_hints)
122
123
# Whitespace surrounding a line break inside a snippet (collapsed to one "\n").
_SNIPPET_BREAK_RE = re.compile(r"[ \t\n]*\n[ \t\n]*")
# Leading or trailing whitespace of a snippet (removed).
_SNIPPET_EDGE_RE = re.compile(r"(^[ \t\n]+|[ \t\n]+$)")


def _clean_snippet(snippet):
    """Collapse whitespace around line breaks and trim both ends of ``snippet``."""
    return _SNIPPET_EDGE_RE.sub(u"", _SNIPPET_BREAK_RE.sub(u"\n", snippet))


def main(request):
    """Search view: run the ``q`` GET parameter against the search index.

    Outcomes:

    * query shorter than two characters -> ``catalogue/search_too_short.html``
    * exactly one result -> redirect to its only fragment hit, or to the book
    * no results -> ``catalogue/search_no_hits.html`` with a suggestion form
    * otherwise -> ``catalogue/search_multiple_hits.html`` with the results
      grouped as ``author``, ``title``, ``content`` and ``other``

    The two result templates also receive the matching tags and a
    "did you mean" suggestion (None when there is none).
    """
    JVM.attachCurrentThread()  # where to put this?

    fuzzy = False  # 0.8

    query = request.GET.get('q', '')
    # NOTE(review): restricting the search to a given book or tag list was
    # sketched here at some point but is not implemented.

    if len(query) < 2:
        return render_to_response('catalogue/search_too_short.html', {'prefix': query},
                                  context_instance=RequestContext(request))

    search = get_search()
    tags = search.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy)
    tags = split_tags(tags)

    toks = StringReader(query)
    tokens_cache = {}

    author_results = search.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
    title_results = search.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)

    # Boost the author/title results that the mixed search confirms, and keep
    # the mixed results that matched neither list for the end of the listing.
    author_title_mixed = search.search_some(toks, ['authors', 'title', 'tags'],
                                            fuzzy=fuzzy, tokens_cache=tokens_cache)
    author_and_title = author_results + title_results
    author_title_rest = []
    for mixed in author_title_mixed:
        same_book = [r for r in author_and_title if r.book_id == mixed.book_id]
        for r in same_book:
            r.boost *= 1.1
        # An identity test against a fresh [] literal can never succeed;
        # emptiness is what decides whether this is a leftover result.
        if not same_book:
            author_title_rest.append(mixed)

    # Do a phrase search but a term search as well - this can give us better
    # snippets than search_everywhere, because the query uses only one field.
    text_phrase = SearchResult.aggregate(
        search.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache,
                             snippets=True, book=False, slop=4),
        search.search_some(toks, ['content'], tokens_cache=tokens_cache,
                           snippets=True, book=False))

    everywhere = search.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)

    # Drop from the catch-all results every book already found by one of the
    # more specific searches.
    found_book_ids = set(r.book_id for r in author_results + title_results + text_phrase)
    everywhere = [e for e in everywhere if e.book_id not in found_book_ids]

    author_results = SearchResult.aggregate(author_results)
    title_results = SearchResult.aggregate(title_results)

    everywhere = SearchResult.aggregate(everywhere, author_title_rest)

    for group in [author_results, title_results, text_phrase, everywhere]:
        group.sort(reverse=True)
        for result in group:
            for hit in result.hits:
                hit['snippets'] = [_clean_snippet(s) for s in hit['snippets']]

    suggestion = did_you_mean(query, search.get_tokens(toks, field="SIMPLE"))

    def ensure_exists(r):
        """Return the result's book, or False when the book no longer exists."""
        try:
            return r.book
        except Book.DoesNotExist:
            return False

    # Ensure the books still exist.
    author_results = [r for r in author_results if ensure_exists(r)]
    title_results = [r for r in title_results if ensure_exists(r)]
    text_phrase = [r for r in text_phrase if ensure_exists(r)]
    everywhere = [r for r in everywhere if ensure_exists(r)]

    results = author_results + title_results + text_phrase + everywhere
    results.sort(reverse=True)

    if len(results) == 1:
        # A single result: go straight to the fragment (when there is exactly
        # one fragment hit) or to the book itself.
        fragment_hits = [h for h in results[0].hits if 'fragment' in h]
        if len(fragment_hits) == 1:
            return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
        return HttpResponseRedirect(results[0].book.get_absolute_url())
    elif len(results) == 0:
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render_to_response('catalogue/search_no_hits.html',
                                  {'tags': tags,
                                   'prefix': query,
                                   "form": form,
                                   'did_you_mean': suggestion},
                                  context_instance=RequestContext(request))

    return render_to_response('catalogue/search_multiple_hits.html',
                              {'tags': tags,
                               'prefix': query,
                               'results': {'author': author_results,
                                           'title': title_results,
                                           'content': text_phrase,
                                           'other': everywhere},
                               'did_you_mean': suggestion},
                              context_instance=RequestContext(request))
245                                'did_you_mean': suggestion},
246         context_instance=RequestContext(request))