search fixes
[wolnelektury.git] / apps / search / views.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from django.shortcuts import render_to_response, get_object_or_404
5 from django.template import RequestContext
6 from django.contrib.auth.decorators import login_required
7 from django.views.decorators import cache
8 from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect
9 from django.utils.translation import ugettext as _
10
11 from catalogue.utils import get_random_hash
12 from catalogue.models import Book, Tag, Fragment
13 from catalogue.fields import dumps
14 from catalogue.views import JSONResponse
15 from search import Search, JVM, SearchResult
16 from lucene import StringReader
17 from suggest.forms import PublishingSuggestForm
18 import re
19 import enchant
20
# Polish ('pl_PL') enchant dictionary, created once at import time.
# did_you_mean() uses it to check tokens and to fetch spelling suggestions.
dictionary = enchant.Dict('pl_PL')
22
23
def match_word_re(word):
    r"""Build a whole-word regular expression for ``word``.

    The pattern is used with Django's ``iregex`` lookup, so the
    word-boundary syntax is chosen from
    ``settings.DATABASES['default']['ENGINE']``:

    * MySQL:      ``[[:<:]]word[[:>:]]``
    * PostgreSQL: ``\yword\y``
    * anything else (including SQLite): ``\bword\b``

    Regex metacharacters inside ``word`` are backslash-escaped so that the
    word is always matched literally.

    Returns the pattern as a string; it never returns None, so the result
    can always be passed to a lookup.
    """
    engine = settings.DATABASES['default']['ENGINE']

    # Escape only genuine metacharacters. ``re.escape`` is avoided on purpose:
    # on older Pythons it also backslashes non-ASCII letters, and the pattern
    # is interpreted by the database's regex engine, not by Python.
    literal = re.sub(r"([\\.^$*+?()\[\]{}|])", r"\\\1", word)

    if 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % literal
    if 'postgresql' in engine or 'postgis' in engine:
        return r"\y%s\y" % literal
    # SQLite, plus a best-effort default for engines not listed above.
    return r"\b%s\b" % literal
29
30
def did_you_mean(query, tokens):
    """Return a spell-corrected version of ``query``, or None.

    Each token is skipped when it matches (as a whole word, case-insensitive)
    the name of an ``author`` Tag, or when the module-level ``dictionary``
    accepts it. Every other token is mapped to the dictionary's first
    suggestion, if it offers one.

    Parameters:
        query  -- the query string as typed by the user
        tokens -- iterable of the tokens extracted from ``query``

    Returns the query with each misspelled token replaced by its suggestion,
    or None when no token has a correction.
    """
    change = {}
    for t in tokens:
        # Nothing to check for empty tokens; repeated tokens were handled
        # the first time round.
        if not t or t in change:
            continue

        # exists() asks the database a yes/no question instead of loading
        # every matching Tag just to count them.
        if Tag.objects.filter(category='author',
                              name__iregex=match_word_re(t)).exists():
            continue

        if dictionary.check(t):
            continue

        suggestions = dictionary.suggest(t)
        if suggestions:
            change[t] = suggestions[0]

    if not change:
        return None

    # Replace whole words only, in a single pass:
    #  * the look-arounds stop a short token from rewriting the inside of a
    #    longer word;
    #  * one pass means text produced by one correction is never corrected
    #    again by another;
    #  * longest tokens first, so the alternation prefers the longest match.
    alternatives = "|".join(
        re.escape(t) for t in sorted(change, key=len, reverse=True))
    misspelled = re.compile(r"(?<!\w)(?:%s)(?!\w)" % alternatives, re.UNICODE)
    return misspelled.sub(lambda m: change[m.group(0)], query)
51
52
def hint(request):
    """Return autocomplete hints for the ``term`` GET parameter as JSON.

    A term shorter than two characters yields an empty JSON list without
    touching the search index. Otherwise the response is a JSON list of
    dicts with the keys ``label``, ``category``, ``id`` and ``url``: first
    one entry per tag returned by ``Search.hint_tags``, then one entry per
    book returned by ``Search.hint_books``.

    GET parameters read:
        term -- the prefix typed so far
        tags -- optional tag path, parsed with ``Tag.get_tag_list``
    """
    prefix = request.GET.get('term', '')
    if len(prefix) < 2:
        return JSONResponse([])
    JVM.attachCurrentThread()
    s = Search()

    search_hint = s.hint()
    try:
        tags = request.GET.get('tags', '')
        search_hint.tags(Tag.get_tag_list(tags))
    except Exception:
        # Best effort: an unparsable ``tags`` value just means no tag
        # restriction. ``Exception`` (rather than a bare ``except``) lets
        # SystemExit and KeyboardInterrupt propagate.
        pass
    # NOTE(review): search_hint is configured above but is not passed to the
    # hint_tags()/hint_books() calls below -- confirm whether it should be.

    # Tags are going to restrict the hints here,
    # but tags can be attached to a book as well as to fragments:
    # if the tags concern only the book, the new ones must be in the same book;
    # if they concern themes, they must be in the same fragment.

    tags = s.hint_tags(prefix)
    books = s.hint_books(prefix)

    # TODO: add the hints here

    tag_items = [{'label': t.name,
                  'category': _(t.category),
                  'id': t.id,
                  'url': t.get_absolute_url()}
                 for t in tags]
    book_items = [{'label': b.title,
                   'category': _('book'),
                   'id': b.id,
                   'url': b.get_absolute_url()}
                  for b in books]
    return JSONResponse(tag_items + book_items)
88
89
def main(request):
    """Search the catalogue for the ``q`` GET parameter and show the results.

    GET parameters read:
        q     -- the query text; if missing or shorter than two characters
                 the "search too short" page is rendered
        tags  -- optional tag path, parsed with ``Tag.get_tag_list``; a value
                 that cannot be parsed is treated as "no tags"
        book  -- optional Book id; an unknown id results in a 404
        fuzzy -- if present, searches are run with ``fuzzy=0.7``

    Outcome:
        * exactly one result: redirect to its fragment when the result has
          exactly one hit carrying a ``fragment`` key, otherwise to the book;
        * no results: ``catalogue/search_no_hits.html`` with a
          PublishingSuggestForm and the ``did_you_mean`` suggestion;
        * otherwise: ``catalogue/search_multiple_hits.html`` with the results
          grouped under ``author``, ``title``, ``content`` and ``other``.
    """
    # Imported locally so this function is self-contained with respect to
    # the file's import block.
    import logging

    JVM.attachCurrentThread()  # where to put this?
    srch = Search()

    # A request without ``q`` is handled like an empty query (it ends up on
    # the "too short" page) instead of falling off the end of the view and
    # returning None, which Django rejects.
    query = request.GET.get('q', '')
    tags = request.GET.get('tags', '')
    book_id = request.GET.get('book', None)
    book = None
    if book_id is not None:
        book = get_object_or_404(Book, id=book_id)

    hint = srch.hint()
    try:
        tag_list = Tag.get_tag_list(tags)
    except Exception:
        # Best effort: an unparsable tag path means "no tags". ``Exception``
        # (rather than a bare ``except``) lets SystemExit/KeyboardInterrupt
        # propagate.
        tag_list = []

    if len(query) < 2:
        return render_to_response('catalogue/search_too_short.html', {'tags': tag_list, 'prefix': query},
                                  context_instance=RequestContext(request))

    # NOTE(review): hint is configured here but is not passed to any of the
    # searches below -- confirm whether it should be.
    hint.tags(tag_list)
    if book:
        hint.books(book)

    toks = StringReader(query)
    tokens_cache = {}
    fuzzy = 'fuzzy' in request.GET
    if fuzzy:
        fuzzy = 0.7

    author_results = srch.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
    title_results = srch.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)

    # Boost main author/title results with mixed search, and save the mixed
    # results that match no author/title result for the end of the list.
    author_title_mixed = srch.search_some(toks, ['authors', 'title', 'tags'], fuzzy=fuzzy, tokens_cache=tokens_cache)
    author_title_rest = []
    primary_results = author_results + title_results
    for b in author_title_mixed:
        same_book = [r for r in primary_results if r.book_id == b.book_id]
        for r in same_book:
            r.boost *= 1.1
        # Emptiness test; an identity comparison against a fresh ``[]`` is
        # never true, which would leave author_title_rest permanently empty.
        if not same_book:
            author_title_rest.append(b)

    # Do a phrase search but a term search as well - this can give us better
    # snippets than search_everywhere, because the query uses only one field.
    text_phrase = SearchResult.aggregate(
        srch.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False, slop=4),
        srch.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False))

    everywhere = srch.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)

    # Drop from ``everywhere`` every result whose book already appears in the
    # author, title or content results. The boost of a dropped result is
    # still set to 0.9, as before.
    # NOTE(review): dropped results are not merged into any result list;
    # confirm whether their hits were meant to be kept.
    found_book_ids = set(
        r.book_id for r in author_results + title_results + text_phrase)
    remaining = []
    for e in everywhere:
        if e.book_id in found_book_ids:
            e.boost = 0.9
        else:
            remaining.append(e)
    everywhere = remaining

    author_results = SearchResult.aggregate(author_results)
    title_results = SearchResult.aggregate(title_results)

    everywhere = SearchResult.aggregate(everywhere, author_title_rest)

    # Snippet clean-up: collapse whitespace around line breaks into a single
    # newline, then trim leading and trailing whitespace.
    around_newline = re.compile(r"[ \t\n]*\n[ \t\n]*")
    at_edges = re.compile(r"(^[ \t\n]+|[ \t\n]+$)")
    for res in [author_results, title_results, text_phrase, everywhere]:
        res.sort(reverse=True)
        for r in res:
            for h in r.hits:
                h['snippets'] = [at_edges.sub(u"", around_newline.sub(u"\n", s))
                                 for s in h['snippets']]

    suggestion = did_you_mean(query, srch.get_tokens(toks, field="SIMPLE"))
    # Debug output goes through logging rather than a print statement in the
    # request path.
    logging.getLogger(__name__).debug("dym? %s", repr(suggestion))

    results = author_results + title_results + text_phrase + everywhere
    results.sort(reverse=True)

    if len(results) == 1:
        fragment_hits = [h for h in results[0].hits if 'fragment' in h]
        if len(fragment_hits) == 1:
            anchor = fragment_hits[0]['fragment']
            frag = Fragment.objects.get(anchor=anchor)
            return HttpResponseRedirect(frag.get_absolute_url())
        return HttpResponseRedirect(results[0].book.get_absolute_url())
    elif len(results) == 0:
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render_to_response('catalogue/search_no_hits.html',
                                  {'tags': tag_list,
                                   'prefix': query,
                                   "form": form,
                                   'did_you_mean': suggestion},
                                  context_instance=RequestContext(request))

    return render_to_response('catalogue/search_multiple_hits.html',
                              {'tags': tag_list,
                               'prefix': query,
                               'results': {'author': author_results,
                                           'title': title_results,
                                           'content': text_phrase,
                                           'other': everywhere},
                               'did_you_mean': suggestion},
                              context_instance=RequestContext(request))