apps/search/views.py

   1 # -*- coding: utf-8 -*-
   2
   3 from django.conf import settings
   4 from django.shortcuts import render_to_response, get_object_or_404
   5 from django.template import RequestContext
   6 from django.contrib.auth.decorators import login_required
   7 from django.views.decorators import cache
   8 from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect
   9 from django.utils.translation import ugettext as _
  10
  11 from catalogue.utils import split_tags
  12 from catalogue.models import Book, Tag, Fragment
  13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  14 from catalogue.views import JSONResponse
  15 from search import Search, JVM, SearchResult
  16 from lucene import StringReader
  17 from suggest.forms import PublishingSuggestForm
  18 from time import sleep
  19 import re
  20 #import enchant
  21 import json
  22
  23 #dictionary = enchant.Dict('en_US')
  24
  25
  26 def match_word_re(word):
  27     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
  28         return r"\b%s\b" % word
  29     elif 'mysql' in settings.DATABASES['default']['ENGINE']:
  30         return "[[:<:]]%s[[:>:]]" % word
  31
  32
  33 def did_you_mean(query, tokens):
  34     change = {}
  35     for t in tokens:
  36         authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
  37         if len(authors) > 0:
  38             continue
  39
  40         if False:
  41             if not dictionary.check(t):
  42                 try:
  43                     change_to = dictionary.suggest(t)[0].lower()
  44                     if change_to != t.lower():
  45                         change[t] = change_to
  46                 except IndexError:
  47                     pass
  48
  49     if change == {}:
  50         return None
  51
  52     for frm, to in change.items():
  53         query = query.replace(frm, to)
  54
  55     return query
  56
  57
  58 JVM.attachCurrentThread()
  59 _search = None
  60
  61
  62 def get_search():
  63     global _search
  64
  65     while _search is False:
  66         sleep(1)
  67
  68     if _search is None:
  69         _search = False
  70         _search = Search()
  71     return _search
  72
  73
  74 def hint(request):
  75     prefix = request.GET.get('term', '')
  76     if len(prefix) < 2:
  77         return JSONResponse([])
  78     JVM.attachCurrentThread()
  79
  80     search = get_search()
  81     hint = search.hint()
  82     try:
  83         tags = request.GET.get('tags', '')
  84         hint.tags(Tag.get_tag_list(tags))
  85     except:
  86         pass
  87
  88     # tagi beda ograniczac tutaj
  89     # ale tagi moga byc na ksiazce i na fragmentach
  90     # jezeli tagi dot tylko ksiazki, to wazne zeby te nowe byly w tej samej ksiazce
  91     # jesli zas dotycza themes, to wazne, zeby byly w tym samym fragmencie.
  92
  93     tags = search.hint_tags(prefix, pdcounter=True)
  94     books = search.hint_books(prefix)
  95
  96
  97     def is_dupe(tag):
  98         if isinstance(tag, PDCounterAuthor):
  99             if filter(lambda t: t.slug == tag.slug and t != tag, tags):
 100                 return True
 101         elif isinstance(tag, PDCounterBook):
 102             if filter(lambda b: b.slug == tag.slug, tags):
 103                 return True
 104         return False
 105
 106     tags = filter(lambda t: not is_dupe(t), tags)
 107
 108     def category_name(c):
 109         if c.startswith('pd_'):
 110             c = c[len('pd_'):]
 111         return _(c)
 112
 113     callback = request.GET.get('callback', None)
 114     data = [{'label': t.name,
 115               'category': category_name(t.category),
 116               'id': t.id,
 117               'url': t.get_absolute_url()}
 118               for t in tags] + \
 119               [{'label': b.title,
 120                 'category': _('book'),
 121                 'id': b.id,
 122                 'url': b.get_absolute_url()}
 123                 for b in books]
 124     if callback:
 125         return HttpResponse("%s(%s);" % (callback, json.dumps(data)),
 126                             content_type="application/json; charset=utf-8")
 127     else:
 128         return JSONResponse(data)
 129
 130
 131
 132 def main(request):
 133     results = {}
 134     JVM.attachCurrentThread()  # where to put this?
 135
 136     results = None
 137     query = None
 138     fuzzy = False #0.8
 139
 140     query = request.GET.get('q','')
 141     # book_id = request.GET.get('book', None)
 142     # book = None
 143     # if book_id is not None:
 144     #     book = get_object_or_404(Book, id=book_id)
 145
 146     # hint = search.hint()
 147     # try:
 148     #     tag_list = Tag.get_tag_list(tags)
 149     # except:
 150     #     tag_list = []
 151
 152     if len(query) < 2:
 153         return render_to_response('catalogue/search_too_short.html', {'prefix': query},
 154                                   context_instance=RequestContext(request))
 155
 156     search = get_search()
 157     # hint.tags(tag_list)
 158     # if book:
 159     #     hint.books(book)
 160     tags = search.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy)
 161     tags = split_tags(tags)
 162
 163     toks = StringReader(query)
 164     tokens_cache = {}
 165
 166     author_results = search.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
 167     title_results = search.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)
 168
 169     # Boost main author/title results with mixed search, and save some of its results for end of list.
 170     # boost author, title results
 171     author_title_mixed = search.search_some(toks, ['authors', 'title', 'tags'], fuzzy=fuzzy, tokens_cache=tokens_cache)
 172     author_title_rest = []
 173     for b in author_title_mixed:
 174         bks = filter(lambda ba: ba.book_id == b.book_id, author_results + title_results)
 175         for b2 in bks:
 176             b2.boost *= 1.1
 177         if bks is []:
 178             author_title_rest.append(b)
 179
 180     # Do a phrase search but a term search as well - this can give us better snippets then search_everywhere,
 181     # Because the query is using only one field.
 182     text_phrase = SearchResult.aggregate(
 183         search.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False, slop=4),
 184         search.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False))
 185
 186     everywhere = search.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)
 187
 188     def already_found(results):
 189         def f(e):
 190             for r in results:
 191                 if e.book_id == r.book_id:
 192                     e.boost = 0.9
 193                     results.append(e)
 194                     return True
 195             return False
 196         return f
 197     f = already_found(author_results + title_results + text_phrase)
 198     everywhere = filter(lambda x: not f(x), everywhere)
 199
 200     author_results = SearchResult.aggregate(author_results)
 201     title_results = SearchResult.aggregate(title_results)
 202
 203     everywhere = SearchResult.aggregate(everywhere, author_title_rest)
 204
 205     for res in [author_results, title_results, text_phrase, everywhere]:
 206         res.sort(reverse=True)
 207         for r in res:
 208             for h in r.hits:
 209                 h['snippets'] = map(lambda s:
 210                                     re.subn(r"(^[ \t\n]+|[ \t\n]+$)", u"",
 211                                             re.subn(r"[ \t\n]*\n[ \t\n]*", u"\n", s)[0])[0], h['snippets'])
 212
 213     suggestion = did_you_mean(query, search.get_tokens(toks, field="SIMPLE"))
 214
 215     def ensure_exists(r):
 216         try:
 217             return r.book
 218         except Book.DoesNotExist:
 219             return False
 220
 221     author_results = filter(ensure_exists, author_results)
 222     title_results = filter(ensure_exists, title_results)
 223     text_phrase = filter(ensure_exists, text_phrase)
 224     everywhere = filter(ensure_exists, everywhere)
 225
 226     results = author_results + title_results + text_phrase + everywhere
 227     # ensure books do exists & sort them
 228     results.sort(reverse=True)
 229
 230     if len(results) == 1:
 231         fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
 232         if len(fragment_hits) == 1:
 233             #anchor = fragment_hits[0]['fragment']
 234             #frag = Fragment.objects.get(anchor=anchor)
 235             return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
 236         return HttpResponseRedirect(results[0].book.get_absolute_url())
 237     elif len(results) == 0:
 238         form = PublishingSuggestForm(initial={"books": query + ", "})
 239         return render_to_response('catalogue/search_no_hits.html',
 240                                   {'tags': tags,
 241                                    'prefix': query,
 242                                    "form": form,
 243                                    'did_you_mean': suggestion},
 244             context_instance=RequestContext(request))
 245
 246     return render_to_response('catalogue/search_multiple_hits.html',
 247                               {'tags': tags,
 248                                'prefix': query,
 249                                'results': { 'author': author_results,
 250                                             'title': title_results,
 251                                             'content': text_phrase,
 252                                             'other': everywhere},
 253                                'did_you_mean': suggestion},
 254         context_instance=RequestContext(request))