profiling

[wolnelektury.git] / apps / search / index.py
diff --git a/apps/search/index.py b/apps/search/index.py

index 32cf5f9..26da062 100644 (file)
--- a/apps/search/index.py
+++ b/apps/search/index.py
@@ -18,6 +18,7 @@ import sunburnt
  import custom
  import operator
  
  import custom
  import operator
  
+log = logging.getLogger('search')
  
  class SolrIndex(object):
      def __init__(self, mode=None):
  
  class SolrIndex(object):
      def __init__(self, mode=None):
@@ -111,7 +112,7 @@ class Index(SolrIndex):
      Class indexing books.
      """
      def __init__(self):
      Class indexing books.
      """
      def __init__(self):
-        super(Index, self).__init__()
+        super(Index, self).__init__(mode='rw')
  
      def delete_query(self, *queries):
          """
  
      def delete_query(self, *queries):
          """
@@ -145,6 +146,7 @@ class Index(SolrIndex):
          Removes all tags from the index, then indexes them again.
          Indexed fields include: id, name (with and without Polish stems), category
          """
          Removes all tags from the index, then indexes them again.
          Indexed fields include: id, name (with and without Polish stems), category
          """
+        log.debug("Indexing tags")
          remove_only = kw.get('remove_only', False)
          # first, remove tags from index.
          if tags:
          remove_only = kw.get('remove_only', False)
          # first, remove tags from index.
          if tags:
@@ -202,7 +204,6 @@ class Index(SolrIndex):
                          "uid": "tag%d" % tag.id
                          }
                  self.index.add(doc)
                          "uid": "tag%d" % tag.id
                          }
                  self.index.add(doc)
-                print "%s %s" % (doc['tag_name'], doc['tag_category'])
  
      def create_book_doc(self, book):
          """
  
      def create_book_doc(self, book):
          """
@@ -241,7 +242,7 @@ class Index(SolrIndex):
              self.remove_book(book, remove_snippets=False)
  
          book_doc = self.create_book_doc(book)
              self.remove_book(book, remove_snippets=False)
  
          book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
          # let's not index it - it's only used for extracting publish date
          if 'source_name' in meta_fields:
              del meta_fields['source_name']
          # let's not index it - it's only used for extracting publish date
          if 'source_name' in meta_fields:
              del meta_fields['source_name']
@@ -257,6 +258,7 @@ class Index(SolrIndex):
              'authors': meta_fields['authors'],
              'published_date': meta_fields['published_date']
              }
              'authors': meta_fields['authors'],
              'published_date': meta_fields['published_date']
              }
+
          if 'translators' in meta_fields:
              book_fields['translators'] = meta_fields['translators']
  
          if 'translators' in meta_fields:
              book_fields['translators'] = meta_fields['translators']
  
@@ -461,7 +463,6 @@ class Index(SolrIndex):
                          doc = add_part(snippets, header_index=position, header_type=header.tag,
                                         text=u''.join(footnote),
                                         is_footnote=True)
                          doc = add_part(snippets, header_index=position, header_type=header.tag,
                                         text=u''.join(footnote),
                                         is_footnote=True)
-
                          self.index.add(doc)
                          #print "@ footnote text: %s" % footnote
                          footnote = []
                          self.index.add(doc)
                          #print "@ footnote text: %s" % footnote
                          footnote = []
@@ -518,12 +519,13 @@ class Index(SolrIndex):
  
  
  class SearchResult(object):
  
  
  class SearchResult(object):
-    def __init__(self, doc, how_found=None, query=None):
+    def __init__(self, doc, how_found=None, query=None, query_terms=None):
          #        self.search = search
          self.boost = 1.0
          self._hits = []
          self._processed_hits = None  # processed hits
          self.snippets = []
          #        self.search = search
          self.boost = 1.0
          self._hits = []
          self._processed_hits = None  # processed hits
          self.snippets = []
+        self.query_terms = query_terms
  
          if 'score' in doc:
              self._score = doc['score']
  
          if 'score' in doc:
              self._score = doc['score']
@@ -551,7 +553,9 @@ class SearchResult(object):
              hit = (sec + (header_span,), fragment, self._score, {
                  'how_found': how_found,
                  'snippets_pos': snippets_pos,
              hit = (sec + (header_span,), fragment, self._score, {
                  'how_found': how_found,
                  'snippets_pos': snippets_pos,
-                'snippets_revision': snippets_rev
+                'snippets_revision': snippets_rev,
+                'themes': doc.get('themes', []),
+                'themes_pl': doc.get('themes_pl', [])
                  })
  
              self._hits.append(hit)
                  })
  
              self._hits.append(hit)
@@ -559,7 +563,7 @@ class SearchResult(object):
      def __unicode__(self):
          return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
              (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
      def __unicode__(self):
          return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
              (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
-    
+
      def __str__(self):
          return unicode(self).encode('utf-8')
  
      def __str__(self):
          return unicode(self).encode('utf-8')
  
@@ -647,19 +651,25 @@ class SearchResult(object):
              except catalogue.models.Fragment.DoesNotExist:
                  # stale index
                  continue
              except catalogue.models.Fragment.DoesNotExist:
                  # stale index
                  continue
-
              # Figure out if we were searching for a token matching some word in theme name.
              themes = frag.tags.filter(category='theme')
              # Figure out if we were searching for a token matching some word in theme name.
              themes = frag.tags.filter(category='theme')
-            themes_hit = []
-            # if self.searched is not None:
-            #     tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
-            #     for theme in themes:
-            #         name_tokens = self.search.get_tokens(theme.name, 'POLISH')
-            #         for t in tokens:
-            #             if t in name_tokens:
-            #                 if not theme in themes_hit:
-            #                     themes_hit.append(theme)
-            #                 break
+            themes_hit = set()
+            if self.query_terms is not None:
+                for i in range(0, len(f[self.OTHER]['themes'])):
+                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
+                    tms = map(unicode.lower, tms)
+                    for qt in self.query_terms:
+                        if qt in tms:
+                            themes_hit.add(f[self.OTHER]['themes'][i])
+                            break
+
+            def theme_by_name(n):
+                th = filter(lambda t: t.name == n, themes)
+                if th:
+                    return th[0]
+                else:
+                    return None
+            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
  
              m = {'score': f[self.SCORE],
                   'fragment': frag,
  
              m = {'score': f[self.SCORE],
                   'fragment': frag,
@@ -713,7 +723,7 @@ class Search(SolrIndex):
      Search facilities.
      """
      def __init__(self, default_field="text"):
      Search facilities.
      """
      def __init__(self, default_field="text"):
-        super(Search, self).__init__()
+        super(Search, self).__init__(mode='r')
  
      # def get_tokens(self, searched, field='text', cached=None):
      #     """returns tokens analyzed by a proper (for a field) analyzer
  
      # def get_tokens(self, searched, field='text', cached=None):
      #     """returns tokens analyzed by a proper (for a field) analyzer
@@ -784,6 +794,7 @@ class Search(SolrIndex):
          modal - applies to boolean query
          fuzzy - should the query be fuzzy.
          """
          modal - applies to boolean query
          fuzzy - should the query be fuzzy.
          """
+        if query is None: query = ''
          q = self.index.Q()
          q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                          query.split(r" ")), q)
          q = self.index.Q()
          q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                          query.split(r" ")), q)
@@ -802,8 +813,7 @@ class Search(SolrIndex):
          return [SearchResult(found, how_found=u'search_phrase') for found in res]
  
      def search_some(self, searched, fields, book=True,
          return [SearchResult(found, how_found=u'search_phrase') for found in res]
  
      def search_some(self, searched, fields, book=True,
-                    filters=None,
-                    snippets=True):
+                    filters=None, snippets=True, query_terms=None):
          assert isinstance(fields, list)
          if filters is None: filters = []
          if book: filters.append(self.index.Q(is_book=True))
          assert isinstance(fields, list)
          if filters is None: filters = []
          if book: filters.append(self.index.Q(is_book=True))
@@ -816,7 +826,7 @@ class Search(SolrIndex):
          query = self.index.query(query)
          query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
          res = query.execute()
          query = self.index.query(query)
          query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
          res = query.execute()
-        return [SearchResult(found, how_found='search_some') for found in res]
+        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
  
      # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
      #     """
  
      # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
      #     """
@@ -891,7 +901,7 @@ class Search(SolrIndex):
  
      #     return books
  
  
      #     return books
  
-    def search_everywhere(self, searched):
+    def search_everywhere(self, searched, query_terms=None):
          """
          Tries to use search terms to match different fields of book (or its parts).
          E.g. one word can be an author's surname, another can be a part of the title, and the rest
          """
          Tries to use search terms to match different fields of book (or its parts).
          E.g. one word can be an author's surname, another can be a part of the title, and the rest
@@ -899,7 +909,6 @@ class Search(SolrIndex):
          """
          books = []
          # content only query : themes x content
          """
          books = []
          # content only query : themes x content
-
          q = self.make_term_query(searched, 'text')
          q_themes = self.make_term_query(searched, 'themes_pl')
  
          q = self.make_term_query(searched, 'text')
          q_themes = self.make_term_query(searched, 'themes_pl')
  
@@ -907,7 +916,7 @@ class Search(SolrIndex):
          res = query.execute()
  
          for found in res:
          res = query.execute()
  
          for found in res:
-            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent'))
+            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
  
          # query themes/content x author/title/tags
          in_content = self.index.Q()
  
          # query themes/content x author/title/tags
          in_content = self.index.Q()
@@ -921,8 +930,9 @@ class Search(SolrIndex):
  
          q = in_content & in_meta
          res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
  
          q = in_content & in_meta
          res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
+
          for found in res:
          for found in res:
-            books.append(SearchResult(found, how_found='search_everywhere'))
+            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
  
          return books
  
  
          return books
  
@@ -946,7 +956,6 @@ class Search(SolrIndex):
                      continue
                  text = snippets.get((int(position),
                                       int(length)))
                      continue
                  text = snippets.get((int(position),
                                       int(length)))
-                print "== %s -- %s ==" % (query, text)
                  snip = self.index.highlight(text=text, field=field, q=query)
                  snips[idx] = snip
                  if snip:
                  snip = self.index.highlight(text=text, field=field, q=query)
                  snips[idx] = snip
                  if snip:
@@ -963,6 +972,7 @@ class Search(SolrIndex):
          snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
  
          searchresult.snippets = snips
          snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
  
          searchresult.snippets = snips
+
          return snips
  
      def hint_tags(self, query, pdcounter=True, prefix=True):
          return snips
  
      def hint_tags(self, query, pdcounter=True, prefix=True):