theme hits work!
authorMarcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
Fri, 24 Aug 2012 12:04:09 +0000 (14:04 +0200)
committerMarcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
Fri, 24 Aug 2012 12:04:09 +0000 (14:04 +0200)
apps/search/custom.py
apps/search/index.py
apps/search/views.py

index fcc3bac..33ce47d 100644 (file)
@@ -8,8 +8,6 @@ import copy
 
 
 class TermVectorOptions(search.Options):
 
 
 class TermVectorOptions(search.Options):
-    option_name = "tv"
-
     def __init__(self, schema, original=None):
         self.schema = schema
         if original is None:
     def __init__(self, schema, original=None):
         self.schema = schema
         if original is None:
@@ -30,7 +28,8 @@ class TermVectorOptions(search.Options):
 
     def options(self):
         opts = {}
 
     def options(self):
         opts = {}
-        opts['tv'] = 'true'
+        if self.positions or self.fields:
+            opts['tv'] = 'true'
         if self.positions:
             opts['tv.positions'] = 'true'
         if self.fields:
         if self.positions:
             opts['tv.positions'] = 'true'
         if self.fields:
@@ -72,12 +71,12 @@ def __term_vector(self, positions=False, fields=None):
     newself.term_vectorer.update(positions, fields)
     return newself
 setattr(search.SolrSearch, 'term_vector', __term_vector)
     newself.term_vectorer.update(positions, fields)
     return newself
 setattr(search.SolrSearch, 'term_vector', __term_vector)
-__original__init_common_modules = search.SolrSearch._init_common_modules
 
 
 def __patched__init_common_modules(self):
     __original__init_common_modules(self)
     self.term_vectorer = TermVectorOptions(self.schema)
 
 
 def __patched__init_common_modules(self):
     __original__init_common_modules(self)
     self.term_vectorer = TermVectorOptions(self.schema)
+__original__init_common_modules = search.SolrSearch._init_common_modules
 setattr(search.SolrSearch, '_init_common_modules', __patched__init_common_modules)
 
 
 setattr(search.SolrSearch, '_init_common_modules', __patched__init_common_modules)
 
 
@@ -118,7 +117,6 @@ class CustomSolrInterface(sunburnt.SolrInterface):
             end = int(wrd.xpath("int[@name='end']")[0].text)
             matches.add((start, end))
 
             end = int(wrd.xpath("int[@name='end']")[0].text)
             matches.add((start, end))
 
-        print matches
         if matches:
             return self.substring(kwargs['text'], matches,
                             margins=kwargs.get('margins', 30),
         if matches:
             return self.substring(kwargs['text'], matches,
                             margins=kwargs.get('margins', 30),
@@ -127,8 +125,8 @@ class CustomSolrInterface(sunburnt.SolrInterface):
             return None
 
     def analyze(self, **kwargs):
             return None
 
     def analyze(self, **kwargs):
-        doc = self._analyze(self, **kwargs)
-        terms = doc.xpath("/lst[@name='index']/arr[last()]/lst/str[1]")
+        doc = self._analyze(**kwargs)
+        terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]")
         terms = map(lambda n: unicode(n.text), terms)
         return terms
 
         terms = map(lambda n: unicode(n.text), terms)
         return terms
 
@@ -154,3 +152,4 @@ class CustomSolrInterface(sunburnt.SolrInterface):
             snip = snip[:s + off] + mark[0] + snip[s + off:]
             # maybe break on word boundaries
         return snip
             snip = snip[:s + off] + mark[0] + snip[s + off:]
             # maybe break on word boundaries
         return snip
+
index 32cf5f9..4962cae 100644 (file)
@@ -202,7 +202,6 @@ class Index(SolrIndex):
                         "uid": "tag%d" % tag.id
                         }
                 self.index.add(doc)
                         "uid": "tag%d" % tag.id
                         }
                 self.index.add(doc)
-                print "%s %s" % (doc['tag_name'], doc['tag_category'])
 
     def create_book_doc(self, book):
         """
 
     def create_book_doc(self, book):
         """
@@ -518,12 +517,13 @@ class Index(SolrIndex):
 
 
 class SearchResult(object):
 
 
 class SearchResult(object):
-    def __init__(self, doc, how_found=None, query=None):
+    def __init__(self, doc, how_found=None, query=None, query_terms=None):
         #        self.search = search
         self.boost = 1.0
         self._hits = []
         self._processed_hits = None  # processed hits
         self.snippets = []
         #        self.search = search
         self.boost = 1.0
         self._hits = []
         self._processed_hits = None  # processed hits
         self.snippets = []
+        self.query_terms = query_terms
 
         if 'score' in doc:
             self._score = doc['score']
 
         if 'score' in doc:
             self._score = doc['score']
@@ -551,7 +551,9 @@ class SearchResult(object):
             hit = (sec + (header_span,), fragment, self._score, {
                 'how_found': how_found,
                 'snippets_pos': snippets_pos,
             hit = (sec + (header_span,), fragment, self._score, {
                 'how_found': how_found,
                 'snippets_pos': snippets_pos,
-                'snippets_revision': snippets_rev
+                'snippets_revision': snippets_rev,
+                'themes': doc.get('themes', []),
+                'themes_pl': doc.get('themes_pl', [])
                 })
 
             self._hits.append(hit)
                 })
 
             self._hits.append(hit)
@@ -559,7 +561,7 @@ class SearchResult(object):
     def __unicode__(self):
         return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
             (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
     def __unicode__(self):
         return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
             (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
-    
+
     def __str__(self):
         return unicode(self).encode('utf-8')
 
     def __str__(self):
         return unicode(self).encode('utf-8')
 
@@ -647,19 +649,26 @@ class SearchResult(object):
             except catalogue.models.Fragment.DoesNotExist:
                 # stale index
                 continue
             except catalogue.models.Fragment.DoesNotExist:
                 # stale index
                 continue
-
+            print f
             # Figure out if we were searching for a token matching some word in theme name.
             themes = frag.tags.filter(category='theme')
             # Figure out if we were searching for a token matching some word in theme name.
             themes = frag.tags.filter(category='theme')
-            themes_hit = []
-            # if self.searched is not None:
-            #     tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
-            #     for theme in themes:
-            #         name_tokens = self.search.get_tokens(theme.name, 'POLISH')
-            #         for t in tokens:
-            #             if t in name_tokens:
-            #                 if not theme in themes_hit:
-            #                     themes_hit.append(theme)
-            #                 break
+            themes_hit = set()
+            if self.query_terms is not None:
+                for i in range(0, len(f[self.OTHER]['themes'])):
+                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
+                    tms = map(unicode.lower, tms)
+                    for qt in self.query_terms:
+                        if qt in tms:
+                            themes_hit.add(f[self.OTHER]['themes'][i])
+                            break
+
+            def theme_by_name(n):
+                th = filter(lambda t: t.name == n, themes)
+                if th:
+                    return th[0]
+                else:
+                    return None
+            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
 
             m = {'score': f[self.SCORE],
                  'fragment': frag,
 
             m = {'score': f[self.SCORE],
                  'fragment': frag,
@@ -802,8 +811,7 @@ class Search(SolrIndex):
         return [SearchResult(found, how_found=u'search_phrase') for found in res]
 
     def search_some(self, searched, fields, book=True,
         return [SearchResult(found, how_found=u'search_phrase') for found in res]
 
     def search_some(self, searched, fields, book=True,
-                    filters=None,
-                    snippets=True):
+                    filters=None, snippets=True, query_terms=None):
         assert isinstance(fields, list)
         if filters is None: filters = []
         if book: filters.append(self.index.Q(is_book=True))
         assert isinstance(fields, list)
         if filters is None: filters = []
         if book: filters.append(self.index.Q(is_book=True))
@@ -816,7 +824,7 @@ class Search(SolrIndex):
         query = self.index.query(query)
         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
         res = query.execute()
         query = self.index.query(query)
         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
         res = query.execute()
-        return [SearchResult(found, how_found='search_some') for found in res]
+        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 
     # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
     #     """
 
     # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
     #     """
@@ -891,7 +899,7 @@ class Search(SolrIndex):
 
     #     return books
 
 
     #     return books
 
-    def search_everywhere(self, searched):
+    def search_everywhere(self, searched, query_terms=None):
         """
         Tries to use search terms to match different fields of book (or its parts).
         E.g. one word can be an author survey, another be a part of the title, and the rest
         """
         Tries to use search terms to match different fields of book (or its parts).
         E.g. one word can be an author survey, another be a part of the title, and the rest
@@ -899,7 +907,6 @@ class Search(SolrIndex):
         """
         books = []
         # content only query : themes x content
         """
         books = []
         # content only query : themes x content
-
         q = self.make_term_query(searched, 'text')
         q_themes = self.make_term_query(searched, 'themes_pl')
 
         q = self.make_term_query(searched, 'text')
         q_themes = self.make_term_query(searched, 'themes_pl')
 
@@ -907,7 +914,7 @@ class Search(SolrIndex):
         res = query.execute()
 
         for found in res:
         res = query.execute()
 
         for found in res:
-            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent'))
+            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
 
         # query themes/content x author/title/tags
         in_content = self.index.Q()
 
         # query themes/content x author/title/tags
         in_content = self.index.Q()
@@ -921,8 +928,9 @@ class Search(SolrIndex):
 
         q = in_content & in_meta
         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
 
         q = in_content & in_meta
         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
+
         for found in res:
         for found in res:
-            books.append(SearchResult(found, how_found='search_everywhere'))
+            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
 
         return books
 
 
         return books
 
index 11ce8e8..9818429 100644 (file)
@@ -116,6 +116,9 @@ def main(request):
             context_instance=RequestContext(request))
     search = Search()
 
             context_instance=RequestContext(request))
     search = Search()
 
+    theme_terms = search.index.analyze(text=query, field="themes_pl") \
+        + search.index.analyze(text=query, field="themes")
+
             # change hints
     tags = search.hint_tags(query, pdcounter=True, prefix=False)
     tags = split_tags(tags)
             # change hints
     tags = search.hint_tags(query, pdcounter=True, prefix=False)
     tags = split_tags(tags)
@@ -125,7 +128,7 @@ def main(request):
 
     # Boost main author/title results with mixed search, and save some of its results for end of list.
     # boost author, title results
 
     # Boost main author/title results with mixed search, and save some of its results for end of list.
     # boost author, title results
-    author_title_mixed = search.search_some(query, ['authors', 'title', 'tags'])
+    author_title_mixed = search.search_some(query, ['authors', 'title', 'tags'], query_terms=theme_terms)
     author_title_rest = []
 
     for b in author_title_mixed:
     author_title_rest = []
 
     for b in author_title_mixed:
@@ -139,9 +142,9 @@ def main(request):
     # Because the query is using only one field.
     text_phrase = SearchResult.aggregate(
         search.search_phrase(query, 'text', snippets=True, book=False),
     # Because the query is using only one field.
     text_phrase = SearchResult.aggregate(
         search.search_phrase(query, 'text', snippets=True, book=False),
-        search.search_some(query, ['text'], snippets=True, book=False))
+        search.search_some(query, ['text'], snippets=True, book=False, query_terms=theme_terms))
 
 
-    everywhere = search.search_everywhere(query)
+    everywhere = search.search_everywhere(query, query_terms=theme_terms)
 
     def already_found(results):
         def f(e):
 
     def already_found(results):
         def f(e):