2d8838ccf20e06654fc4e22d4c36a73224c7792e
[wolnelektury.git] / apps / search / views.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from django.shortcuts import render_to_response, get_object_or_404
5 from django.template import RequestContext
6 from django.contrib.auth.decorators import login_required
7 from django.views.decorators import cache
8 from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect
9 from django.utils.translation import ugettext as _
10
11 from catalogue.utils import split_tags
12 from catalogue.models import Book, Tag, Fragment
13 from catalogue.fields import dumps
14 from catalogue.views import JSONResponse
15 from search import Search, JVM, SearchResult
16 from lucene import StringReader
17 from suggest.forms import PublishingSuggestForm
18 import re
19 import enchant
20
21 dictionary = enchant.Dict('pl_PL')
22
23
def match_word_re(word, engine=None):
    """Return a regex that matches *word* only as a whole word.

    The result is meant for a database-side regex lookup (``__iregex``), so
    the word-boundary syntax is chosen from the database engine name.

    Arguments:
    word -- the text to look for; regex metacharacters in it are escaped,
            so it is always matched literally.
    engine -- database engine path; when omitted, the engine of the
              ``default`` database from the Django settings is used.
    """
    if engine is None:
        engine = settings.DATABASES['default']['ENGINE']

    # Escape only real metacharacters: a backslash before a punctuation
    # character is understood by every dialect handled below, while letters
    # (including non-ASCII ones) are left untouched.
    escaped = re.sub(r"([\\.^$*+?()\[\]{}|])", r"\\\1", word)

    if 'sqlite' in engine:
        return r"\b%s\b" % escaped
    if 'mysql' in engine:
        return "[[:<:]]%s[[:>:]]" % escaped
    if 'postgres' in engine or 'postgis' in engine:
        # PostgreSQL spells the word boundary as \y.
        return r"\y%s\y" % escaped
    # Unknown backend: emulate word boundaries with POSIX bracket
    # expressions instead of silently returning None.
    return "(^|[^[:alnum:]_])%s([^[:alnum:]_]|$)" % escaped
29
30
def did_you_mean(query, tokens):
    """Build a spelling-corrected version of *query*, or return None.

    Every token that is neither part of an author's name nor known to the
    spell-checking dictionary is replaced by the dictionary's first
    suggestion (lower-cased).

    Arguments:
    query -- the search text as typed by the user.
    tokens -- the words of the query to be spell-checked.

    Returns the corrected query, or None when there is nothing to correct.
    """
    corrections = {}
    for token in tokens:
        # Words that occur in an author's name are left alone.
        # exists() asks the database for a single row instead of loading
        # every matching tag just to count them.
        is_author = Tag.objects.filter(
            category='author', name__iregex=match_word_re(token)).exists()
        if is_author or dictionary.check(token):
            continue

        candidates = dictionary.suggest(token)
        if not candidates:
            continue
        replacement = candidates[0].lower()
        if replacement != token.lower():
            corrections[token.lower()] = replacement

    if not corrections:
        return None

    # Replace whole words only, all in one pass, so that a short token is
    # not rewritten inside a longer word and one correction cannot be hit
    # again by another. Longest alternatives go first in the alternation.
    alternatives = sorted(corrections, key=len, reverse=True)
    pattern = re.compile(
        r"\b(?:%s)\b" % "|".join(re.escape(word) for word in alternatives),
        re.IGNORECASE | re.UNICODE)

    def substitute(match):
        found = match.group(0)
        return corrections.get(found.lower(), found)

    corrected = pattern.sub(substitute, query)
    # A "suggestion" identical to what the user typed is of no use.
    if corrected == query:
        return None
    return corrected
53
54
def hint(request):
    """Return search-box suggestions as a JSON list.

    Reads the typed text from the ``term`` GET parameter and answers with
    the tags and books found for that prefix. Each entry has ``label``,
    ``category``, ``id`` and ``url`` keys. A prefix shorter than two
    characters yields an empty list.
    """
    prefix = request.GET.get('term', '')
    if len(prefix) < 2:
        return JSONResponse([])
    JVM.attachCurrentThread()
    search = Search()

    # NOTE(review): this hint object is configured here but is not passed
    # to the hint_tags()/hint_books() calls below.
    hint_query = search.hint()
    try:
        hint_query.tags(Tag.get_tag_list(request.GET.get('tags', '')))
    except Exception:
        # Best effort: an unusable ``tags`` parameter must not break hints.
        pass

    # Tags are going to restrict the results here, but tags can be attached
    # both to a book and to fragments. If the tags concern only the book,
    # the new ones have to be in the same book; if they concern themes,
    # they have to be in the same fragment.

    tags = search.hint_tags(prefix, pdcounter=True)
    books = search.hint_books(prefix)

    def category_name(category):
        # Drop the 'pd_' prefix if present, then translate. The return must
        # sit outside the ``if``, otherwise every other category yields None.
        if category.startswith('pd_'):
            category = category[len('pd_'):]
        return _(category)

    tag_hints = [{'label': tag.name,
                  'category': category_name(tag.category),
                  'id': tag.id,
                  'url': tag.get_absolute_url()}
                 for tag in tags]
    book_hints = [{'label': book.title,
                   'category': _('book'),
                   'id': book.id,
                   'url': book.get_absolute_url()}
                  for book in books]
    return JSONResponse(tag_hints + book_hints)
93
94
def _clean_snippet(snippet):
    """Collapse whitespace around line breaks and trim both ends of *snippet*."""
    collapsed = re.sub(r"[ \t\n]*\n[ \t\n]*", u"\n", snippet)
    return re.sub(r"(^[ \t\n]+|[ \t\n]+$)", u"", collapsed)


def main(request):
    """Run a full-text search for the ``q`` GET parameter and render the results.

    Outcomes:
    - query missing or shorter than two characters: the "too short" page;
    - exactly one result: redirect to its only fragment hit, or to the book;
    - no results: the "no hits" page with a publishing-suggestion form;
    - otherwise: the results page, grouped into author, title, content and
      other matches.
    """
    import logging
    logger = logging.getLogger(__name__)

    # A missing ``q`` is treated like an empty query, so the view always
    # returns a response.
    query = request.GET.get('q', '')
    if len(query) < 2:
        return render_to_response('catalogue/search_too_short.html', {'prefix': query},
                                  context_instance=RequestContext(request))

    JVM.attachCurrentThread()  # where to put this?
    srch = Search()
    fuzzy = False  # 0.8

    tags = split_tags(srch.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy))

    toks = StringReader(query)
    tokens_cache = {}

    author_results = srch.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
    title_results = srch.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)

    # Boost the main author/title results that the mixed search confirms,
    # and keep the mixed results for other books for the end of the list.
    author_title_mixed = srch.search_some(toks, ['authors', 'title', 'tags'],
                                          fuzzy=fuzzy, tokens_cache=tokens_cache)
    author_title_rest = []
    primary_results = author_results + title_results
    for mixed in author_title_mixed:
        same_book = [r for r in primary_results if r.book_id == mixed.book_id]
        for r in same_book:
            r.boost *= 1.1
        # Truthiness test: comparing with ``is []`` can never succeed.
        if not same_book:
            author_title_rest.append(mixed)

    # Do a phrase search but a term search as well - this can give better
    # snippets than search_everywhere, because the query uses only one field.
    text_phrase = SearchResult.aggregate(
        srch.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache,
                           snippets=True, book=False, slop=4),
        srch.search_some(toks, ['content'], tokens_cache=tokens_cache,
                         snippets=True, book=False))

    everywhere = srch.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)

    # Drop the "everywhere" hits for books already found by a more specific search.
    found_book_ids = set(r.book_id for r in author_results + title_results + text_phrase)
    everywhere = [e for e in everywhere if e.book_id not in found_book_ids]

    author_results = SearchResult.aggregate(author_results)
    title_results = SearchResult.aggregate(title_results)
    everywhere = SearchResult.aggregate(everywhere, author_title_rest)

    for group in [author_results, title_results, text_phrase, everywhere]:
        group.sort(reverse=True)
        for result in group:
            for hit in result.hits:
                hit['snippets'] = [_clean_snippet(s) for s in hit['snippets']]

    suggestion = did_you_mean(query, srch.get_tokens(toks, field="SIMPLE"))
    logger.debug("dym? %r", suggestion)

    results = author_results + title_results + text_phrase + everywhere
    results.sort(reverse=True)

    if len(results) == 1:
        fragment_hits = [h for h in results[0].hits if 'fragment' in h]
        if len(fragment_hits) == 1:
            return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
        return HttpResponseRedirect(results[0].book.get_absolute_url())

    if not results:
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render_to_response('catalogue/search_no_hits.html',
                                  {'tags': tags,
                                   'prefix': query,
                                   "form": form,
                                   'did_you_mean': suggestion},
                                  context_instance=RequestContext(request))

    logger.debug("TAGS: %s", tags)
    return render_to_response('catalogue/search_multiple_hits.html',
                              {'tags': tags,
                               'prefix': query,
                               'results': {'author': author_results,
                                           'title': title_results,
                                           'content': text_phrase,
                                           'other': everywhere},
                               'did_you_mean': suggestion},
                              context_instance=RequestContext(request))