librarian bump
[wolnelektury.git] / apps / search / views.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from django.shortcuts import render_to_response, get_object_or_404
5 from django.template import RequestContext
6 from django.contrib.auth.decorators import login_required
7 from django.views.decorators import cache
8 from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect
9 from django.utils.translation import ugettext as _
10
11 from catalogue.utils import split_tags
12 from catalogue.models import Book, Tag, Fragment
13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
14 from catalogue.views import JSONResponse
15 from search import Search, JVM, SearchResult
16 from lucene import StringReader
17 from suggest.forms import PublishingSuggestForm
18 from time import sleep
19 import re
20 #import enchant
21 import json
22
23 #dictionary = enchant.Dict('en_US')
24
25
def match_word_re(word):
    """Build a whole-word regular expression for ``word``.

    The result is meant to be used with Django's ``__iregex`` lookup, which
    hands the pattern to the regex engine of the configured default database,
    so the word-boundary syntax depends on ``DATABASES['default']['ENGINE']``.

    ``word`` is interpolated as-is (it is not regex-escaped).
    NOTE(review): callers presumably pass analyzer tokens without regex
    metacharacters -- confirm before feeding raw user input here.

    Raises:
        NotImplementedError: the default database engine has no known
            word-boundary syntax.  Previously the function silently returned
            ``None`` in that case, and the ``None`` was then used as the
            ``__iregex`` value by the caller.
    """
    engine = settings.DATABASES['default']['ENGINE']
    if 'sqlite' in engine:
        return r"\b%s\b" % word
    if 'mysql' in engine:
        # POSIX-style start-of-word / end-of-word markers.
        return "[[:<:]]%s[[:>:]]" % word
    if 'postgres' in engine or 'postgis' in engine:
        # PostgreSQL spells the word-boundary assertion ``\y``.
        return r"\y%s\y" % word
    raise NotImplementedError(
        "match_word_re: unsupported database engine %r" % (engine,))
31
32
def did_you_mean(query, tokens):
    """Suggest a corrected version of ``query``, or ``None`` if there is none.

    Each token that does not match an author name as a whole word is a
    candidate for spelling correction.  Corrections are then applied to
    ``query`` by plain substring replacement.

    Spell checking is currently switched off: the ``enchant`` import and the
    module-level ``dictionary`` are commented out in this file, so the
    correction branch is guarded by a flag that is always false and the
    function returns ``None`` for every input.

    Args:
        query: the original search string.
        tokens: iterable of tokens extracted from ``query``.

    Returns:
        The corrected query string, or ``None`` when nothing was changed.
    """
    # Flip this only after restoring the `dictionary` object at module level.
    spellcheck_enabled = False

    change = {}
    for t in tokens:
        # Author names are never "corrected".  ``exists()`` asks the database
        # for a yes/no answer instead of loading every matching row.
        is_author = Tag.objects.filter(
            category='author', name__iregex=match_word_re(t)).exists()
        if is_author:
            continue

        if spellcheck_enabled:
            if not dictionary.check(t):
                try:
                    change_to = dictionary.suggest(t)[0].lower()
                    if change_to != t.lower():
                        change[t] = change_to
                except IndexError:
                    # The dictionary had no suggestion for this token.
                    pass

    if not change:
        return None

    for frm, to in change.items():
        query = query.replace(frm, to)

    return query
56
57
# Attach the thread that imports this module to the JVM.
# NOTE(review): presumably required before any Lucene call made from this
# thread -- confirm against the `search` package.
JVM.attachCurrentThread()

# Shared Search instance, created on first use by get_search().
#   None  -> not created yet
#   False -> some caller is creating it right now
_search = None


def get_search():
    """Return the shared ``Search`` instance, creating it on first use.

    While another caller is in the middle of creating the instance
    (``_search is False``) this function polls once per second until the
    creation is finished.

    If ``Search()`` raises, the sentinel is reset to ``None`` before the
    exception propagates, so the next call retries.  Previously a failed
    construction left ``_search`` set to ``False`` and every later caller
    slept in the polling loop forever.
    """
    global _search

    while _search is False:
        sleep(1)

    if _search is None:
        _search = False  # mark "creation in progress"
        created = None
        try:
            created = Search()
        finally:
            # On success publish the instance; on failure fall back to None
            # so that waiting and future callers can try again.
            _search = created
    return _search
72
73
def hint(request):
    """Return autocomplete suggestions for the ``term`` GET parameter.

    GET parameters read:
        term: the typed prefix; fewer than 2 characters yields an empty list.
        tags: optional tag list passed to ``Tag.get_tag_list``.
        callback: optional JSONP callback name.

    Returns:
        A JSON list of ``{'label', 'category', 'id', 'url'}`` dicts (matching
        tags first, then matching books).  With a valid ``callback`` the list
        is wrapped as ``callback(<json>);``.  A ``callback`` containing
        anything other than ASCII letters, digits, ``_``, ``$`` or ``.`` is
        rejected with HTTP 400, because the name is written verbatim into the
        response body.
    """
    prefix = request.GET.get('term', '')
    if len(prefix) < 2:
        return JSONResponse([])
    JVM.attachCurrentThread()

    search = get_search()
    search_hint = search.hint()
    try:
        search_hint.tags(Tag.get_tag_list(request.GET.get('tags', '')))
    except Exception:
        # Best effort: an unparsable or unknown tag list just means the hint
        # is not restricted by tags.
        pass

    # Tags will be restricted here,
    # but tags may be attached to a book as well as to fragments.
    # If the tags concern only the book, the new ones must be in the same book;
    # if they concern themes, they must be in the same fragment.

    found_tags = search.hint_tags(prefix, pdcounter=True)
    books = search.hint_books(prefix)

    def is_dupe(tag):
        # A PDCounter author is a duplicate when a different entry in the
        # result list has the same slug.
        if isinstance(tag, PDCounterAuthor):
            return any(t.slug == tag.slug and t != tag for t in found_tags)
        # NOTE(review): this scan includes `tag` itself, so every
        # PDCounterBook present in the list matches its own slug and is
        # dropped.  Possibly `books` was meant here -- confirm the intent.
        if isinstance(tag, PDCounterBook):
            return any(b.slug == tag.slug for b in found_tags)
        return False

    tags = [t for t in found_tags if not is_dupe(t)]

    def category_name(c):
        # 'pd_author' and 'author' share one translated label.
        if c.startswith('pd_'):
            c = c[len('pd_'):]
        return _(c)

    tag_items = [{'label': t.name,
                  'category': category_name(t.category),
                  'id': t.id,
                  'url': t.get_absolute_url()}
                 for t in tags]
    book_items = [{'label': b.title,
                   'category': _('book'),
                   'id': b.id,
                   'url': b.get_absolute_url()}
                  for b in books]
    data = tag_items + book_items

    callback = request.GET.get('callback', None)
    if not callback:
        return JSONResponse(data)

    # The callback name comes from the client and is echoed into the
    # response, so only accept something that looks like a JS identifier path.
    if not re.match(r'[A-Za-z0-9_$.]+\Z', callback):
        return HttpResponse("Invalid callback name.", status=400,
                            content_type="text/plain; charset=utf-8")

    # NOTE(review): the JSONP body is served as application/json, as before.
    return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
                        content_type="application/json; charset=utf-8")
129             
130
131
def main(request):
    """Run a full search for the ``q`` GET parameter and render the outcome.

    Steps:
      1. Queries shorter than 2 characters render
         ``catalogue/search_too_short.html``.
      2. Phrase searches are run on authors and titles; a mixed
         authors/title/tags search boosts the books already found and keeps
         its remaining hits for the end of the list.
      3. Content is searched both as a phrase and by terms (with snippets),
         followed by a search across all fields.
      4. Results whose book no longer exists in the database are dropped.
      5. Exactly one result redirects to that fragment or book, no results
         render ``catalogue/search_no_hits.html`` with a suggestion form, and
         anything else renders ``catalogue/search_multiple_hits.html``.
    """
    JVM.attachCurrentThread()  # where to put this?

    fuzzy = False  # 0.8
    query = request.GET.get('q', '')

    # NOTE: restricting the search by a ?book= id or by a tag list was
    # sketched here at some point but is not implemented.

    if len(query) < 2:
        return render_to_response('catalogue/search_too_short.html', {'prefix': query},
                                  context_instance=RequestContext(request))

    search = get_search()
    tags = search.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy)
    tags = split_tags(tags)

    toks = StringReader(query)
    tokens_cache = {}

    author_results = search.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
    title_results = search.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)

    # Boost main author/title results with mixed search, and save the mixed
    # hits that matched neither for the end of the list.
    author_title_mixed = search.search_some(toks, ['authors', 'title', 'tags'], fuzzy=fuzzy, tokens_cache=tokens_cache)
    author_title_rest = []
    main_hits = author_results + title_results
    for mixed in author_title_mixed:
        same_book = [hit for hit in main_hits if hit.book_id == mixed.book_id]
        for hit in same_book:
            hit.boost *= 1.1
        # This used to read `if bks is []`, an identity test against a brand
        # new list that is never true, so nothing was ever saved.
        if not same_book:
            author_title_rest.append(mixed)

    # Do a phrase search but a term search as well - this can give us better
    # snippets than search_everywhere, because the query uses only one field.
    text_phrase = SearchResult.aggregate(
        search.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False, slop=4),
        search.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False))

    everywhere = search.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)

    def already_found(results):
        # Returns a predicate telling whether a hit's book is already present
        # in `results`.  A matching hit gets its boost set to 0.9 and is
        # appended to `results` (a private list, see the call below).
        def f(e):
            for r in results:
                if e.book_id == r.book_id:
                    e.boost = 0.9
                    results.append(e)
                    return True
            return False
        return f
    f = already_found(author_results + title_results + text_phrase)
    everywhere = [e for e in everywhere if not f(e)]

    author_results = SearchResult.aggregate(author_results)
    title_results = SearchResult.aggregate(title_results)

    everywhere = SearchResult.aggregate(everywhere, author_title_rest)

    # Snippet clean-up: collapse whitespace around line breaks into a single
    # newline, then strip leading/trailing whitespace.
    around_newline = re.compile(r"[ \t\n]*\n[ \t\n]*")
    outer_space = re.compile(r"(^[ \t\n]+|[ \t\n]+$)")

    for res in [author_results, title_results, text_phrase, everywhere]:
        res.sort(reverse=True)
        for r in res:
            for h in r.hits:
                h['snippets'] = [outer_space.sub(u"", around_newline.sub(u"\n", s))
                                 for s in h['snippets']]

    suggestion = did_you_mean(query, search.get_tokens(toks, field="SIMPLE"))

    def ensure_exists(r):
        # The index may still reference books removed from the database.
        try:
            return r.book
        except Book.DoesNotExist:
            return False

    author_results = [r for r in author_results if ensure_exists(r)]
    title_results = [r for r in title_results if ensure_exists(r)]
    text_phrase = [r for r in text_phrase if ensure_exists(r)]
    everywhere = [r for r in everywhere if ensure_exists(r)]

    results = author_results + title_results + text_phrase + everywhere
    results.sort(reverse=True)

    if len(results) == 1:
        # A single result: go straight to it, preferring the one fragment hit.
        fragment_hits = [h for h in results[0].hits if 'fragment' in h]
        if len(fragment_hits) == 1:
            return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
        return HttpResponseRedirect(results[0].book.get_absolute_url())

    if not results:
        form = PublishingSuggestForm(initial={"books": query + ", "})
        return render_to_response('catalogue/search_no_hits.html',
                                  {'tags': tags,
                                   'prefix': query,
                                   "form": form,
                                   'did_you_mean': suggestion},
                                  context_instance=RequestContext(request))

    return render_to_response('catalogue/search_multiple_hits.html',
                              {'tags': tags,
                               'prefix': query,
                               'results': {'author': author_results,
                                           'title': title_results,
                                           'content': text_phrase,
                                           'other': everywhere},
                               'did_you_mean': suggestion},
                              context_instance=RequestContext(request))