cover attribution in book text and checking script
[wolnelektury.git] / apps / search / views.py
index 2945c42..fd5883e 100644 (file)
@@ -10,11 +10,11 @@ from django.utils.translation import ugettext as _
 
 from catalogue.utils import split_tags
 from catalogue.models import Book, Tag, Fragment
 
 from catalogue.utils import split_tags
 from catalogue.models import Book, Tag, Fragment
-from catalogue.fields import dumps
 from catalogue.views import JSONResponse
 from search import Search, JVM, SearchResult
 from lucene import StringReader
 from suggest.forms import PublishingSuggestForm
 from catalogue.views import JSONResponse
 from search import Search, JVM, SearchResult
 from lucene import StringReader
 from suggest.forms import PublishingSuggestForm
+from time import sleep
 import re
 import enchant
 
 import re
 import enchant
 
@@ -51,8 +51,22 @@ def did_you_mean(query, tokens):
 
     return query
 
 
     return query
 
+
 JVM.attachCurrentThread()
 JVM.attachCurrentThread()
-search = Search()
+_search = None
+
+
+def get_search():
+    global _search
+
+    while _search is False:
+        sleep(1)
+
+    if _search is None:
+        _search = False
+        _search = Search()
+    return _search
+
 
 def hint(request):
     prefix = request.GET.get('term', '')
 
 def hint(request):
     prefix = request.GET.get('term', '')
@@ -60,6 +74,7 @@ def hint(request):
         return JSONResponse([])
     JVM.attachCurrentThread()
 
         return JSONResponse([])
     JVM.attachCurrentThread()
 
+    search = get_search()
     hint = search.hint()
     try:
         tags = request.GET.get('tags', '')
     hint = search.hint()
     try:
         tags = request.GET.get('tags', '')
@@ -77,7 +92,7 @@ def hint(request):
 
     def category_name(c):
         if c.startswith('pd_'):
 
     def category_name(c):
         if c.startswith('pd_'):
-            c=c[len('pd_'):]
+            c = c[len('pd_'):]
         return _(c)
 
     return JSONResponse(
         return _(c)
 
     return JSONResponse(
@@ -101,109 +116,118 @@ def main(request):
     query = None
     fuzzy = False #0.8
 
     query = None
     fuzzy = False #0.8
 
-    if 'q' in request.GET:
-        # tags = request.GET.get('tags', '')
-        query = request.GET['q']
-        # book_id = request.GET.get('book', None)
-        # book = None
-        # if book_id is not None:
-        #     book = get_object_or_404(Book, id=book_id)
-
-        # hint = search.hint()
-        # try:
-        #     tag_list = Tag.get_tag_list(tags)
-        # except:
-        #     tag_list = []
-
-        if len(query) < 2:
-            return render_to_response('catalogue/search_too_short.html', {'prefix': query},
-                                      context_instance=RequestContext(request))
-
-        # hint.tags(tag_list)
-        # if book:
-        #     hint.books(book)
-        tags = search.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy)
-        tags = split_tags(tags)
-
-        toks = StringReader(query)
-        tokens_cache = {}
-
-        author_results = search.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
-        title_results = search.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)
-
-        # Boost main author/title results with mixed search, and save some of its results for end of list.
-        # boost author, title results
-        author_title_mixed = search.search_some(toks, ['authors', 'title', 'tags'], fuzzy=fuzzy, tokens_cache=tokens_cache)
-        author_title_rest = []
-        for b in author_title_mixed:
-            bks = filter(lambda ba: ba.book_id == b.book_id, author_results + title_results)
-            for b2 in bks:
-                b2.boost *= 1.1
-            if bks is []:
-                author_title_rest.append(b)
-
-        # Do a phrase search but a term search as well - this can give us better snippets then search_everywhere,
-        # Because the query is using only one field.
-        text_phrase = SearchResult.aggregate(
-            search.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False, slop=4),
-            search.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False))
-
-        everywhere = search.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)
-
-        def already_found(results):
-            def f(e):
-                for r in results:
-                    if e.book_id == r.book_id:
-                        e.boost = 0.9
-                        results.append(e)
-                        return True
-                return False
-            return f
-        f = already_found(author_results + title_results + text_phrase)
-        everywhere = filter(lambda x: not f(x), everywhere)
-
-        author_results = SearchResult.aggregate(author_results)
-        title_results = SearchResult.aggregate(title_results)
-
-        everywhere = SearchResult.aggregate(everywhere, author_title_rest)
-
-        for res in [author_results, title_results, text_phrase, everywhere]:
-            res.sort(reverse=True)
-            for r in res:
-                for h in r.hits:
-                    h['snippets'] = map(lambda s:
-                                        re.subn(r"(^[ \t\n]+|[ \t\n]+$)", u"",
-                                                re.subn(r"[ \t\n]*\n[ \t\n]*", u"\n", s)[0])[0], h['snippets'])
-
-        suggestion = did_you_mean(query, search.get_tokens(toks, field="SIMPLE"))
-        print "dym? %s" % repr(suggestion).encode('utf-8')
-
-        results = author_results + title_results + text_phrase + everywhere
-        results.sort(reverse=True)
-
-        if len(results) == 1:
-            fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
-            if len(fragment_hits) == 1:
-                #anchor = fragment_hits[0]['fragment']
-                #frag = Fragment.objects.get(anchor=anchor)
-                return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
-            return HttpResponseRedirect(results[0].book.get_absolute_url())
-        elif len(results) == 0:
-            form = PublishingSuggestForm(initial={"books": query + ", "})
-            return render_to_response('catalogue/search_no_hits.html',
-                                      {'tags': tags,
-                                       'prefix': query,
-                                       "form": form,
-                                       'did_you_mean': suggestion},
-                context_instance=RequestContext(request))
-
-        print "TAGS: %s" % tags
-        return render_to_response('catalogue/search_multiple_hits.html',
+    query = request.GET.get('q','')
+    # book_id = request.GET.get('book', None)
+    # book = None
+    # if book_id is not None:
+    #     book = get_object_or_404(Book, id=book_id)
+
+    # hint = search.hint()
+    # try:
+    #     tag_list = Tag.get_tag_list(tags)
+    # except:
+    #     tag_list = []
+
+    if len(query) < 2:
+        return render_to_response('catalogue/search_too_short.html', {'prefix': query},
+                                  context_instance=RequestContext(request))
+
+    search = get_search()
+    # hint.tags(tag_list)
+    # if book:
+    #     hint.books(book)
+    tags = search.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy)
+    tags = split_tags(tags)
+
+    toks = StringReader(query)
+    tokens_cache = {}
+
+    author_results = search.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
+    title_results = search.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)
+
+    # Boost main author/title results with mixed search, and save some of its results for end of list.
+    # boost author, title results
+    author_title_mixed = search.search_some(toks, ['authors', 'title', 'tags'], fuzzy=fuzzy, tokens_cache=tokens_cache)
+    author_title_rest = []
+    for b in author_title_mixed:
+        bks = filter(lambda ba: ba.book_id == b.book_id, author_results + title_results)
+        for b2 in bks:
+            b2.boost *= 1.1
+        if bks is []:
+            author_title_rest.append(b)
+
+    # Do a phrase search but a term search as well - this can give us better snippets then search_everywhere,
+    # Because the query is using only one field.
+    text_phrase = SearchResult.aggregate(
+        search.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False, slop=4),
+        search.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False))
+
+    everywhere = search.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)
+
+    def already_found(results):
+        def f(e):
+            for r in results:
+                if e.book_id == r.book_id:
+                    e.boost = 0.9
+                    results.append(e)
+                    return True
+            return False
+        return f
+    f = already_found(author_results + title_results + text_phrase)
+    everywhere = filter(lambda x: not f(x), everywhere)
+
+    author_results = SearchResult.aggregate(author_results)
+    title_results = SearchResult.aggregate(title_results)
+
+    everywhere = SearchResult.aggregate(everywhere, author_title_rest)
+
+    for res in [author_results, title_results, text_phrase, everywhere]:
+        res.sort(reverse=True)
+        for r in res:
+            for h in r.hits:
+                h['snippets'] = map(lambda s:
+                                    re.subn(r"(^[ \t\n]+|[ \t\n]+$)", u"",
+                                            re.subn(r"[ \t\n]*\n[ \t\n]*", u"\n", s)[0])[0], h['snippets'])
+
+    suggestion = did_you_mean(query, search.get_tokens(toks, field="SIMPLE"))
+
+    def ensure_exists(r):
+        try:
+            return r.book
+        except Book.DoesNotExist:
+            return False
+
+    author_results = filter(ensure_exists, author_results)
+    title_results = filter(ensure_exists, title_results)
+    text_phrase = filter(ensure_exists, text_phrase)
+    everywhere = filter(ensure_exists, everywhere)
+
+    results = author_results + title_results + text_phrase + everywhere
+    # ensure books do exists & sort them
+    results.sort(reverse=True)
+
+    if len(results) == 1:
+        fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
+        if len(fragment_hits) == 1:
+            #anchor = fragment_hits[0]['fragment']
+            #frag = Fragment.objects.get(anchor=anchor)
+            return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
+        return HttpResponseRedirect(results[0].book.get_absolute_url())
+    elif len(results) == 0:
+        form = PublishingSuggestForm(initial={"books": query + ", "})
+        return render_to_response('catalogue/search_no_hits.html',
                                   {'tags': tags,
                                    'prefix': query,
                                   {'tags': tags,
                                    'prefix': query,
-                                   'results': { 'author': author_results,
-                                                'title': title_results,
-                                                'content': text_phrase,
-                                                'other': everywhere},
+                                   "form": form,
                                    'did_you_mean': suggestion},
             context_instance=RequestContext(request))
                                    'did_you_mean': suggestion},
             context_instance=RequestContext(request))
+
+    return render_to_response('catalogue/search_multiple_hits.html',
+                              {'tags': tags,
+                               'prefix': query,
+                               'results': { 'author': author_results,
+                                            'title': title_results,
+                                            'content': text_phrase,
+                                            'other': everywhere},
+                               'did_you_mean': suggestion},
+        context_instance=RequestContext(request))