fixes on staging
author Marcin Koziej <marcin@lolownia.org>
Wed, 1 Feb 2012 12:06:41 +0000 (13:06 +0100)
committer Marcin Koziej <marcin@lolownia.org>
Wed, 1 Feb 2012 12:06:41 +0000 (13:06 +0100)
apps/search/index.py
apps/search/management/commands/reindex.py
apps/search/management/commands/snippets.py [new file with mode: 0755]
apps/search/views.py
wolnelektury/static/css/book_box.css

diff --git a/apps/search/index.py b/apps/search/index.py
index 97145d3..8ea3124 100644 (file)
@@ -1212,17 +1212,25 @@ class Search(IndexStore):
         if position is None or length is None:
             return None
         # locate content.
-        snippets = Snippets(stored.get('book_id')).open()
+        book_id = int(stored.get('book_id'))
+        snippets = Snippets(book_id).open()
         try:
-            text = snippets.get((int(position),
-                                 int(length)))
-        finally:
-            snippets.close()
+            try:
+                text = snippets.get((int(position),
+                                     int(length)))
+            finally:
+                snippets.close()
 
-        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
-        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
-        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
+            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
+            #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
+            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
 
+        except Exception, e:
+            e2 = e
+            if hasattr(e, 'getJavaException'):
+                e2 = unicode(e.getJavaException())
+            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
+                e2)
         return snip
 
     @staticmethod
@@ -1302,7 +1310,7 @@ class Search(IndexStore):
 
         return only_term
 
-    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
+    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
         """
         Return auto-complete hints for tags
         using prefix search.
@@ -1314,14 +1322,14 @@ class Search(IndexStore):
             if prefix:
                 q = self.make_prefix_phrase(toks, field)
             else:
-                q = self.make_term_query(toks, field)
+                q = self.make_term_query(toks, field, fuzzy=fuzzy)
             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
 
         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
 
         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
 
-    def hint_books(self, string, max_results=50, prefix=True):
+    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
         """
         Returns auto-complete hints for book titles
         Because we do not index 'pseudo' title-tags.
@@ -1332,7 +1340,7 @@ class Search(IndexStore):
         if prefix:
             q = self.make_prefix_phrase(toks, 'title')
         else:
-            q = self.make_term_query(toks, 'title')
+            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
 
         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
 
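diff --git a/apps/search/management/commands/reindex.py b/apps/search/management/commands/reindex.py
index bce4708..8901102 100755 (executable)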
@@ -1,9 +1,14 @@
 from django.core.management.base import BaseCommand
 
+from optparse import make_option
 class Command(BaseCommand):
     help = 'Reindex everything.'
     args = ''
-
+    
+    option_list = BaseCommand.option_list + (
+        make_option('-n', '--book-id', action='store_true', dest='book_id', default=False,
+            help='book id'),
+    )
     def handle(self, *args, **opts):
         from catalogue.models import Book
         import search
@@ -13,13 +18,16 @@ class Command(BaseCommand):
         if args:
             books = []
             for a in args:
-                books += Book.objects.filter(slug=a).all()
+                if opts['book_id']:
+                    books += Book.objects.filter(id=int(a)).all()
+                else:
+                    books += Book.objects.filter(slug=a).all()
         else:
             books = Book.objects.all()
             
         for b in books:
             print b.title
-            idx.index_book(b, None)
+            idx.index_book(b)
         print 'Reindexing tags.'
         idx.index_tags()
         idx.close()
diff --git a/apps/search/management/commands/snippets.py b/apps/search/management/commands/snippets.py
new file mode 100755 (executable)
index 0000000..058ea05
--- /dev/null
@@ -0,0 +1,60 @@
+from django.core.management.base import BaseCommand
+
+from glob import glob
+from optparse import make_option
+from os import path
+from sys import stdout
+from django.conf import settings
+
+class Command(BaseCommand):
+    help = 'Reindex everything.'
+    args = ''
+
+    option_list = BaseCommand.option_list + (
+        make_option('-C', '--check-just-read', action='store_true', dest='check', default=False,
+            help='Check snippets utf-8'),
+        make_option('-c', '--check', action='store_true', dest='check2', default=False,
+            help='Check snippets utf-8 by walking through index'),
+        )
+
+
+    def handle(self, *args, **opts):
+        from catalogue.models import Book
+        import search
+
+        if opts['check']:
+            sfn = glob(settings.SEARCH_INDEX+'snippets/*')
+            print sfn
+            for fn in sfn:
+                print fn
+                bkid = int(path.basename(fn))
+                with open(fn) as f:
+                    cont = f.read()
+                    try:
+                        uc = cont.decode('utf-8')
+                    except UnicodeDecodeError, ude:
+                        print "error in snippets %d" % bkid
+        if opts['check2']:
+            s = search.Search()
+            reader = s.searcher.getIndexReader()
+            numdocs = reader.numDocs()
+            for did in range(numdocs):
+                doc = reader.document(did)
+                if doc and doc.get('book_id'):
+                    bkid = int(doc.get('book_id'))
+                    #import pdb; pdb.set_trace()
+                    stdout.write("\r%d / %d" % (did, numdocs))
+                    stdout.flush()
+                    ss  = doc.get('snippet_position')
+                    sl  = doc.get('snippet_length')
+                    if ss and sl:
+                        snips = Snippets(bkid)
+                        try:
+                            txt = snips.get((ss,sl))
+                            assert len(txt) == sl
+                        except UnicodeDecodeError, ude:
+                            stdout.write("\nerror in snippets %d\n" % bkid)
+                            raise ude
+
+            stdout.write("\ndone.\n")
+
diff --git a/apps/search/views.py b/apps/search/views.py
index 623b311..de2337f 100644 (file)
@@ -37,7 +37,9 @@ def did_you_mean(query, tokens):
 
         if not dictionary.check(t):
             try:
-                change[t] = dictionary.suggest(t)[0]
+                change_to = dictionary.suggest(t)[0].lower()
+                if change_to != t.lower():
+                    change[t] = change_to
             except IndexError:
                 pass
 
@@ -94,7 +96,7 @@ def main(request):
 
     results = None
     query = None
-    fuzzy = False
+    fuzzy = False #0.8
 
     if 'q' in request.GET:
         # tags = request.GET.get('tags', '')
@@ -117,14 +119,11 @@ def main(request):
         # hint.tags(tag_list)
         # if book:
         #     hint.books(book)
-        tags = srch.hint_tags(query, pdcounter=True, prefix=False)
+        tags = srch.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy)
         tags = split_tags(tags)
 
         toks = StringReader(query)
         tokens_cache = {}
-        fuzzy = 'fuzzy' in request.GET
-        if fuzzy:
-            fuzzy = 0.7
 
         author_results = srch.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
         title_results = srch.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)
@@ -182,9 +181,9 @@ def main(request):
         if len(results) == 1:
             fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
             if len(fragment_hits) == 1:
-                anchor = fragment_hits[0]['fragment']
-                frag = Fragment.objects.get(anchor=anchor)
-                return HttpResponseRedirect(frag.get_absolute_url())
+                #anchor = fragment_hits[0]['fragment']
+                #frag = Fragment.objects.get(anchor=anchor)
+                return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
             return HttpResponseRedirect(results[0].book.get_absolute_url())
         elif len(results) == 0:
             form = PublishingSuggestForm(initial={"books": query + ", "})
diff --git a/wolnelektury/static/css/book_box.css b/wolnelektury/static/css/book_box.css
index abf07de..876bf06 100755 (executable)
  * ingenous float containment hack 
  * http://www.mikepadgett.com/technology/technical/alternative-to-the-pie-clearfix-hack/
  */
+/*
 .search-result .book-box-inner {
     height: 1%;
     overflow: hidden;
 }
+*/
 
 .book-mini-box img, .book-box img, .book-wide-box img, .search-result img {
     width: 13.9em;