Test for parenthood by looking for children instead of HTML.
[wolnelektury.git] / apps / search / views.py
index 881815d..dc9e27b 100644 (file)
@@ -1,9 +1,10 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
 from django.conf import settings
 from django.shortcuts import render_to_response, get_object_or_404
 from django.template import RequestContext
 from django.conf import settings
 from django.shortcuts import render_to_response, get_object_or_404
 from django.template import RequestContext
-from django.contrib.auth.decorators import login_required
 from django.views.decorators import cache
 from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect
 from django.utils.translation import ugettext as _
 from django.views.decorators import cache
 from django.http import HttpResponse, HttpResponseRedirect, Http404, HttpResponsePermanentRedirect
 from django.utils.translation import ugettext as _
@@ -12,16 +13,12 @@ from catalogue.utils import split_tags
 from catalogue.models import Book, Tag, Fragment
 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
 from catalogue.views import JSONResponse
 from catalogue.models import Book, Tag, Fragment
 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
 from catalogue.views import JSONResponse
-from search import Search, JVM, SearchResult
-from lucene import StringReader
+from search import Search, SearchResult
 from suggest.forms import PublishingSuggestForm
 from suggest.forms import PublishingSuggestForm
-from time import sleep
 import re
 import re
-import enchant
+#import enchant
 import json
 
 import json
 
-dictionary = enchant.Dict('pl_PL')
-
 
 def match_word_re(word):
     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
 
 def match_word_re(word):
     if 'sqlite' in settings.DATABASES['default']['ENGINE']:
@@ -30,60 +27,47 @@ def match_word_re(word):
         return "[[:<:]]%s[[:>:]]" % word
 
 
         return "[[:<:]]%s[[:>:]]" % word
 
 
-def did_you_mean(query, tokens):
-    change = {}
-    for t in tokens:
-        authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
-        if len(authors) > 0:
-            continue
-
-        if not dictionary.check(t):
-            try:
-                change_to = dictionary.suggest(t)[0].lower()
-                if change_to != t.lower():
-                    change[t] = change_to
-            except IndexError:
-                pass
-
-    if change == {}:
-        return None
-
-    for frm, to in change.items():
-        query = query.replace(frm, to)
+query_syntax_chars = re.compile(r"[\\/*:(){}]")
 
 
-    return query
 
 
+def remove_query_syntax_chars(query, replace=' '):
+    return query_syntax_chars.sub(' ', query)
 
 
-JVM.attachCurrentThread()
-_search = None
 
 
+def did_you_mean(query, tokens):
+    return query
+    # change = {}
+    # for t in tokens:
+    #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
+    #     if len(authors) > 0:
+    #         continue
+
+    #     if False:
+    #         if not dictionary.check(t):
+    #             try:
+    #                 change_to = dictionary.suggest(t)[0].lower()
+    #                 if change_to != t.lower():
+    #                     change[t] = change_to
+    #             except IndexError:
+    #                 pass
 
 
-def get_search():
-    global _search
+    # if change == {}:
+    #     return None
 
 
-    while _search is False:
-        sleep(1)
+    # for frm, to in change.items():
+    #     query = query.replace(frm, to)
 
 
-    if _search is None:
-        _search = False
-        _search = Search()
-    return _search
+    # return query
 
 
 def hint(request):
     prefix = request.GET.get('term', '')
     if len(prefix) < 2:
         return JSONResponse([])
 
 
 def hint(request):
     prefix = request.GET.get('term', '')
     if len(prefix) < 2:
         return JSONResponse([])
-    JVM.attachCurrentThread()
 
 
-    search = get_search()
-    hint = search.hint()
-    try:
-        tags = request.GET.get('tags', '')
-        hint.tags(Tag.get_tag_list(tags))
-    except:
-        pass
+    prefix = remove_query_syntax_chars(prefix)
 
 
+    search = Search()
     # tagi beda ograniczac tutaj
     # ale tagi moga byc na ksiazce i na fragmentach
     # jezeli tagi dot tylko ksiazki, to wazne zeby te nowe byly w tej samej ksiazce
     # tagi beda ograniczac tutaj
     # ale tagi moga byc na ksiazce i na fragmentach
     # jezeli tagi dot tylko ksiazki, to wazne zeby te nowe byly w tej samej ksiazce
@@ -92,7 +76,6 @@ def hint(request):
     tags = search.hint_tags(prefix, pdcounter=True)
     books = search.hint_books(prefix)
 
     tags = search.hint_tags(prefix, pdcounter=True)
     books = search.hint_books(prefix)
 
-    
     def is_dupe(tag):
         if isinstance(tag, PDCounterAuthor):
             if filter(lambda t: t.slug == tag.slug and t != tag, tags):
     def is_dupe(tag):
         if isinstance(tag, PDCounterAuthor):
             if filter(lambda t: t.slug == tag.slug and t != tag, tags):
@@ -125,64 +108,56 @@ def hint(request):
                             content_type="application/json; charset=utf-8")
     else:
         return JSONResponse(data)
                             content_type="application/json; charset=utf-8")
     else:
         return JSONResponse(data)
-            
 
 
 def main(request):
     results = {}
 
 
 def main(request):
     results = {}
-    JVM.attachCurrentThread()  # where to put this?
 
     results = None
     query = None
 
     results = None
     query = None
-    fuzzy = False #0.8
-
-    query = request.GET.get('q','')
-    # book_id = request.GET.get('book', None)
-    # book = None
-    # if book_id is not None:
-    #     book = get_object_or_404(Book, id=book_id)
 
 
-    # hint = search.hint()
-    # try:
-    #     tag_list = Tag.get_tag_list(tags)
-    # except:
-    #     tag_list = []
+    query = request.GET.get('q', '')
 
     if len(query) < 2:
 
     if len(query) < 2:
-        return render_to_response('catalogue/search_too_short.html', {'prefix': query},
-                                  context_instance=RequestContext(request))
-
-    search = get_search()
-    # hint.tags(tag_list)
-    # if book:
-    #     hint.books(book)
-    tags = search.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy)
+        return render_to_response('catalogue/search_too_short.html',
+                                  {'prefix': query},
+            context_instance=RequestContext(request))
+
+    query = remove_query_syntax_chars(query)
+    
+    search = Search()
+
+    theme_terms = search.index.analyze(text=query, field="themes_pl") \
+        + search.index.analyze(text=query, field="themes")
+
+            # change hints
+    tags = search.hint_tags(query, pdcounter=True, prefix=False)
     tags = split_tags(tags)
 
     tags = split_tags(tags)
 
-    toks = StringReader(query)
-    tokens_cache = {}
+    author_results = search.search_phrase(query, 'authors', book=True)
+    translator_results = search.search_phrase(query, 'translators', book=True)
 
 
-    author_results = search.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
-    title_results = search.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)
+    title_results = search.search_phrase(query, 'title', book=True)
 
     # Boost main author/title results with mixed search, and save some of its results for end of list.
     # boost author, title results
 
     # Boost main author/title results with mixed search, and save some of its results for end of list.
     # boost author, title results
-    author_title_mixed = search.search_some(toks, ['authors', 'title', 'tags'], fuzzy=fuzzy, tokens_cache=tokens_cache)
+    author_title_mixed = search.search_some(query, ['authors', 'translators', 'title', 'tags'], query_terms=theme_terms)
     author_title_rest = []
     author_title_rest = []
+
     for b in author_title_mixed:
     for b in author_title_mixed:
-        bks = filter(lambda ba: ba.book_id == b.book_id, author_results + title_results)
-        for b2 in bks:
+        also_in_mixed = filter(lambda ba: ba.book_id == b.book_id, author_results + translator_results + title_results)
+        for b2 in also_in_mixed:
             b2.boost *= 1.1
             b2.boost *= 1.1
-        if bks is []:
+        if also_in_mixed is []:
             author_title_rest.append(b)
 
     # Do a phrase search but a term search as well - this can give us better snippets then search_everywhere,
     # Because the query is using only one field.
     text_phrase = SearchResult.aggregate(
             author_title_rest.append(b)
 
     # Do a phrase search but a term search as well - this can give us better snippets then search_everywhere,
     # Because the query is using only one field.
     text_phrase = SearchResult.aggregate(
-        search.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False, slop=4),
-        search.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False))
+        search.search_phrase(query, 'text', snippets=True, book=False),
+        search.search_some(query, ['text'], snippets=True, book=False, query_terms=theme_terms))
 
 
-    everywhere = search.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)
+    everywhere = search.search_everywhere(query, query_terms=theme_terms)
 
     def already_found(results):
         def f(e):
 
     def already_found(results):
         def f(e):
@@ -193,23 +168,25 @@ def main(request):
                     return True
             return False
         return f
                     return True
             return False
         return f
-    f = already_found(author_results + title_results + text_phrase)
+    f = already_found(author_results + translator_results + title_results + text_phrase)
     everywhere = filter(lambda x: not f(x), everywhere)
 
     author_results = SearchResult.aggregate(author_results)
     everywhere = filter(lambda x: not f(x), everywhere)
 
     author_results = SearchResult.aggregate(author_results)
+    translator_results = SearchResult.aggregate(translator_results)
     title_results = SearchResult.aggregate(title_results)
 
     everywhere = SearchResult.aggregate(everywhere, author_title_rest)
 
     title_results = SearchResult.aggregate(title_results)
 
     everywhere = SearchResult.aggregate(everywhere, author_title_rest)
 
-    for res in [author_results, title_results, text_phrase, everywhere]:
+    for field, res in [('authors', author_results),
+                       ('translators', translator_results),
+                       ('title', title_results),
+                       ('text', text_phrase),
+                       ('text', everywhere)]:
         res.sort(reverse=True)
         for r in res:
         res.sort(reverse=True)
         for r in res:
-            for h in r.hits:
-                h['snippets'] = map(lambda s:
-                                    re.subn(r"(^[ \t\n]+|[ \t\n]+$)", u"",
-                                            re.subn(r"[ \t\n]*\n[ \t\n]*", u"\n", s)[0])[0], h['snippets'])
+            search.get_snippets(r, query, field, 3)
 
 
-    suggestion = did_you_mean(query, search.get_tokens(toks, field="SIMPLE"))
+    suggestion = u''
 
     def ensure_exists(r):
         try:
 
     def ensure_exists(r):
         try:
@@ -218,22 +195,25 @@ def main(request):
             return False
 
     author_results = filter(ensure_exists, author_results)
             return False
 
     author_results = filter(ensure_exists, author_results)
+    translator_results = filter(ensure_exists, translator_results)
     title_results = filter(ensure_exists, title_results)
     text_phrase = filter(ensure_exists, text_phrase)
     everywhere = filter(ensure_exists, everywhere)
 
     title_results = filter(ensure_exists, title_results)
     text_phrase = filter(ensure_exists, text_phrase)
     everywhere = filter(ensure_exists, everywhere)
 
-    results = author_results + title_results + text_phrase + everywhere
+    results = author_results + translator_results + title_results + text_phrase + everywhere
     # ensure books do exists & sort them
     # ensure books do exists & sort them
-    results.sort(reverse=True)
-
-    if len(results) == 1:
-        fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
-        if len(fragment_hits) == 1:
-            #anchor = fragment_hits[0]['fragment']
-            #frag = Fragment.objects.get(anchor=anchor)
-            return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
-        return HttpResponseRedirect(results[0].book.get_absolute_url())
-    elif len(results) == 0:
+    for res in (author_results, translator_results, title_results, text_phrase, everywhere):
+        res.sort(reverse=True)
+
+    # We don't want to redirect to book text, but rather display result page even with one result.
+    # if len(results) == 1:
+    #     fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
+    #     if len(fragment_hits) == 1:
+    #         #anchor = fragment_hits[0]['fragment']
+    #         #frag = Fragment.objects.get(anchor=anchor)
+    #         return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
+    #     return HttpResponseRedirect(results[0].book.get_absolute_url())
+    if len(results) == 0:
         form = PublishingSuggestForm(initial={"books": query + ", "})
         return render_to_response('catalogue/search_no_hits.html',
                                   {'tags': tags,
         form = PublishingSuggestForm(initial={"books": query + ", "})
         return render_to_response('catalogue/search_no_hits.html',
                                   {'tags': tags,
@@ -245,9 +225,10 @@ def main(request):
     return render_to_response('catalogue/search_multiple_hits.html',
                               {'tags': tags,
                                'prefix': query,
     return render_to_response('catalogue/search_multiple_hits.html',
                               {'tags': tags,
                                'prefix': query,
-                               'results': { 'author': author_results,
-                                            'title': title_results,
-                                            'content': text_phrase,
-                                            'other': everywhere},
+                               'results': {'author': author_results,
+                                           'translator': translator_results,
+                                           'title': title_results,
+                                           'content': text_phrase,
+                                           'other': everywhere},
                                'did_you_mean': suggestion},
         context_instance=RequestContext(request))
                                'did_you_mean': suggestion},
         context_instance=RequestContext(request))