optimizeindex removes documents for deleted books
[wolnelektury.git] / apps / search / views.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from django.shortcuts import render_to_response, get_object_or_404
5 from django.template import RequestContext
6 from django.contrib.auth.decorators import login_required
7 from django.views.decorators import cache
8 from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect
9 from django.utils.translation import ugettext as _
10
11 from catalogue.utils import split_tags
12 from catalogue.models import Book, Tag, Fragment
13 from catalogue.views import JSONResponse
14 from search import Search, JVM, SearchResult
15 from lucene import StringReader
16 from suggest.forms import PublishingSuggestForm
17 from time import sleep
18 import re
19 import enchant
20
21 dictionary = enchant.Dict('pl_PL')
22
23
def match_word_re(word):
    """Return a regex that matches ``word`` as a whole word.

    The pattern is written in the dialect of the database configured as
    ``settings.DATABASES['default']``; in this module it is passed to an
    ``__iregex`` lookup, which is evaluated by the database backend.

    ``word`` is interpolated verbatim (it is not regex-escaped), so callers
    should only pass plain word tokens.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'mysql' in engine:
        # MySQL spells word boundaries as [[:<:]] and [[:>:]].
        return "[[:<:]]%s[[:>:]]" % word
    if 'postgresql' in engine or 'postgis' in engine:
        # PostgreSQL: \y matches at the beginning or end of a word.
        return r"\y%s\y" % word
    # SQLite, plus a best-effort fallback for any other engine, so that the
    # caller never receives None as a lookup value.
    return r"\b%s\b" % word
29
30
def did_you_mean(query, tokens):
    """Return a spell-corrected variant of ``query``, or None.

    Every token that does not match an author's name and is rejected by the
    module-level spelling ``dictionary`` is replaced in ``query`` by the
    dictionary's first suggestion (lowercased).  None is returned when no
    token needs correcting.

    query  -- the search string as typed by the user
    tokens -- iterable of word tokens extracted from ``query``
    """
    change = {}
    for t in tokens:
        # Words that occur in an author's name are left alone.  exists() asks
        # the database a yes/no question instead of loading every match.
        if Tag.objects.filter(category='author', name__iregex=match_word_re(t)).exists():
            continue

        if dictionary.check(t):
            continue

        suggestions = dictionary.suggest(t)
        if not suggestions:
            continue

        change_to = suggestions[0].lower()
        if change_to != t.lower():
            change[t] = change_to

    if not change:
        return None

    for frm, to in change.items():
        # Replace whole words only: a plain str.replace() would also rewrite
        # the token where it merely occurs inside a longer word.
        whole_word = re.compile(r"(?<!\w)%s(?!\w)" % re.escape(frm), re.UNICODE)
        # A callable replacement keeps backslashes in ``to`` literal.
        query = whole_word.sub(lambda m, to=to: to, query)

    return query
53
54
JVM.attachCurrentThread()

# Shared Search instance, created lazily by get_search().
#   None  -- not created yet
#   False -- a caller is currently creating it
_search = None


def get_search():
    """Return the shared ``Search`` instance, creating it on first use.

    While another caller is constructing the instance (``_search`` is False)
    this polls once per second until construction finishes.  If construction
    fails, the marker is reset so that waiting and later callers can retry
    instead of polling forever, and the exception is re-raised.
    """
    global _search

    while _search is False:
        sleep(1)

    if _search is None:
        _search = False
        try:
            _search = Search()
        except BaseException:
            # Never leave the "in progress" marker behind on failure.
            _search = None
            raise
    return _search
69
70
def hint(request):
    """Return autocomplete hints for the ``term`` GET parameter as JSON.

    The response is a list of dicts with ``label``, ``category``, ``id`` and
    ``url`` keys: first the matching tags, then the matching books.  Terms
    shorter than two characters yield an empty list.
    """
    prefix = request.GET.get('term', '')
    if len(prefix) < 2:
        return JSONResponse([])
    JVM.attachCurrentThread()

    search = get_search()
    search_hint = search.hint()
    try:
        tags = request.GET.get('tags', '')
        search_hint.tags(Tag.get_tag_list(tags))
    except Exception:
        # Best effort: a missing or unparsable ``tags`` parameter simply
        # leaves the hint unrestricted.
        pass

    # Tags are going to restrict the hints here,
    # but tags can be attached to a book as well as to fragments.
    # If the tags concern only the book, the new ones must be in the same book;
    # if they concern themes, they must be in the same fragment.

    tags = search.hint_tags(prefix, pdcounter=True)
    books = search.hint_books(prefix)

    def category_name(c):
        # Strip the 'pd_' prefix so both variants share one translated label.
        if c.startswith('pd_'):
            c = c[len('pd_'):]
        return _(c)

    tag_hints = [{'label': t.name,
                  'category': category_name(t.category),
                  'id': t.id,
                  'url': t.get_absolute_url()}
                 for t in tags]
    book_hints = [{'label': b.title,
                   'category': _('book'),
                   'id': b.id,
                   'url': b.get_absolute_url()}
                  for b in books]

    return JSONResponse(tag_hints + book_hints)
109
110
def main(request):
    """Run a search for the ``q`` GET parameter and render the outcome.

    * Queries shorter than two characters render the "too short" page.
    * Exactly one result redirects to that book, or to its fragment when
      exactly one hit of the result carries a ``fragment``.
    * No results render the "no hits" page with a publishing-suggestion form.
    * Otherwise the "multiple hits" page is rendered with results grouped
      into ``author``, ``title``, ``content`` and ``other``.

    Results whose book no longer exists in the database are dropped.
    """
    JVM.attachCurrentThread()  # where to put this?

    fuzzy = False  # 0.8

    query = request.GET.get('q', '')
    if len(query) < 2:
        return render_to_response('catalogue/search_too_short.html', {'prefix': query},
                                  context_instance=RequestContext(request))

    search = get_search()
    tags = search.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy)
    tags = split_tags(tags)

    toks = StringReader(query)
    tokens_cache = {}

    author_results = search.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
    title_results = search.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)

    # Boost main author/title results with mixed search, and save some of its results for end of list.
    author_title_mixed = search.search_some(toks, ['authors', 'title', 'tags'],
                                            fuzzy=fuzzy, tokens_cache=tokens_cache)
    author_title_rest = []
    for b in author_title_mixed:
        bks = [ba for ba in author_results + title_results if ba.book_id == b.book_id]
        for b2 in bks:
            b2.boost *= 1.1
        # Emptiness must be tested by truthiness: an identity comparison with
        # a fresh [] literal is never true and would leave this list empty.
        if not bks:
            author_title_rest.append(b)

    # Do a phrase search but a term search as well - this can give us better snippets than search_everywhere,
    # because the query is using only one field.
    text_phrase = SearchResult.aggregate(
        search.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache,
                             snippets=True, book=False, slop=4),
        search.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False))

    everywhere = search.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)

    # Keep only those "everywhere" hits whose book was not already found by
    # one of the more specific searches above.
    found = author_results + title_results + text_phrase
    remaining = []
    for e in everywhere:
        if any(e.book_id == r.book_id for r in found):
            e.boost = 0.9
        else:
            remaining.append(e)
    everywhere = remaining

    author_results = SearchResult.aggregate(author_results)
    title_results = SearchResult.aggregate(title_results)

    everywhere = SearchResult.aggregate(everywhere, author_title_rest)

    # Snippet clean-up: squeeze whitespace around line breaks into a single
    # newline, then trim leading and trailing whitespace.
    collapse_breaks = re.compile(r"[ \t\n]*\n[ \t\n]*")
    trim_edges = re.compile(r"(^[ \t\n]+|[ \t\n]+$)")

    def clean_snippet(s):
        return trim_edges.sub(u"", collapse_breaks.sub(u"\n", s))

    for res in [author_results, title_results, text_phrase, everywhere]:
        res.sort(reverse=True)
        for r in res:
            for h in r.hits:
                h['snippets'] = [clean_snippet(s) for s in h['snippets']]

    suggestion = did_you_mean(query, search.get_tokens(toks, field="SIMPLE"))

    def ensure_exists(r):
        # Accessing r.book raises Book.DoesNotExist when the book is gone.
        try:
            return r.book
        except Book.DoesNotExist:
            return False

    author_results = [r for r in author_results if ensure_exists(r)]
    title_results = [r for r in title_results if ensure_exists(r)]
    text_phrase = [r for r in text_phrase if ensure_exists(r)]
    everywhere = [r for r in everywhere if ensure_exists(r)]

    results = author_results + title_results + text_phrase + everywhere
    results.sort(reverse=True)

    if len(results) == 1:
        fragment_hits = [h for h in results[0].hits if 'fragment' in h]
        if len(fragment_hits) == 1:
            return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
        return HttpResponseRedirect(results[0].book.get_absolute_url())
    elif len(results) == 0:
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render_to_response('catalogue/search_no_hits.html',
                                  {'tags': tags,
                                   'prefix': query,
                                   "form": form,
                                   'did_you_mean': suggestion},
                                  context_instance=RequestContext(request))

    return render_to_response('catalogue/search_multiple_hits.html',
                              {'tags': tags,
                               'prefix': query,
                               'results': {'author': author_results,
                                           'title': title_results,
                                           'content': text_phrase,
                                           'other': everywhere},
                               'did_you_mean': suggestion},
                              context_instance=RequestContext(request))