GzipPipelineCachedStorage from fnpdjango.

[wolnelektury.git] / apps / search / index.py
diff --git a/apps/search/index.py b/apps/search/index.py

index 32cf5f9..7fb60b5 100644 (file)
--- a/apps/search/index.py
+++ b/apps/search/index.py
@@ -1,5 +1,7 @@
  # -*- coding: utf-8 -*-
-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
  from django.conf import settings
  
  import os
@@ -18,6 +20,7 @@ import sunburnt
  import custom
  import operator
  
+log = logging.getLogger('search')
  
  class SolrIndex(object):
      def __init__(self, mode=None):
@@ -92,7 +95,8 @@ class Snippets(object):
  
      def close(self):
          """Close snippet file"""
-        self.file.close()
+        if self.file:
+            self.file.close()
  
      def remove(self):
          self.revision = None
@@ -111,7 +115,7 @@ class Index(SolrIndex):
      Class indexing books.
      """
      def __init__(self):
-        super(Index, self).__init__()
+        super(Index, self).__init__(mode='rw')
  
      def delete_query(self, *queries):
          """
@@ -132,7 +136,6 @@ class Index(SolrIndex):
                  for res in ids:
                      uids.add(res['uid'])
                  st += rows
-                #        print "Will delete %s" % ','.join([x for x in uids])
          if uids:
              self.index.delete(uids)
              return True
@@ -145,6 +148,7 @@ class Index(SolrIndex):
          Removes all tags from index, then index them again.
          Indexed fields include: id, name (with and without polish stems), category
          """
+        log.debug("Indexing tags")
          remove_only = kw.get('remove_only', False)
          # first, remove tags from index.
          if tags:
@@ -161,7 +165,7 @@ class Index(SolrIndex):
  
                  q_id_cat = self.index.Q(q_id & q_cat)
                  tag_qs.append(q_id_cat)
-            self.delete_query(tag_qs)
+            self.delete_query(*tag_qs)
          else:  # all
              q = self.index.Q(tag_id__any=True)
              self.delete_query(q)
@@ -202,7 +206,6 @@ class Index(SolrIndex):
                          "uid": "tag%d" % tag.id
                          }
                  self.index.add(doc)
-                print "%s %s" % (doc['tag_name'], doc['tag_category'])
  
      def create_book_doc(self, book):
          """
@@ -241,7 +244,7 @@ class Index(SolrIndex):
              self.remove_book(book, remove_snippets=False)
  
          book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
          # let's not index it - it's only used for extracting publish date
          if 'source_name' in meta_fields:
              del meta_fields['source_name']
@@ -257,6 +260,7 @@ class Index(SolrIndex):
              'authors': meta_fields['authors'],
              'published_date': meta_fields['published_date']
              }
+
          if 'translators' in meta_fields:
              book_fields['translators'] = meta_fields['translators']
  
@@ -461,9 +465,7 @@ class Index(SolrIndex):
                          doc = add_part(snippets, header_index=position, header_type=header.tag,
                                         text=u''.join(footnote),
                                         is_footnote=True)
-
                          self.index.add(doc)
-                        #print "@ footnote text: %s" % footnote
                          footnote = []
  
                      # handle fragments and themes.
@@ -496,7 +498,6 @@ class Index(SolrIndex):
                                         fragment_anchor=fid,
                                         text=fix_format(frag['text']),
                                         themes=frag['themes'])
-                        #print '@ FRAG %s' % frag['content']
                          self.index.add(doc)
  
                          # Collect content.
@@ -509,7 +510,6 @@ class Index(SolrIndex):
                          # in the end, add a section text.
                  doc = add_part(snippets, header_index=position,
                                 header_type=header.tag, text=fix_format(content))
-                #print '@ CONTENT: %s' % fix_format(content)
  
                  self.index.add(doc)
  
@@ -518,12 +518,13 @@ class Index(SolrIndex):
  
  
  class SearchResult(object):
-    def __init__(self, doc, how_found=None, query=None):
+    def __init__(self, doc, how_found=None, query=None, query_terms=None):
          #        self.search = search
          self.boost = 1.0
          self._hits = []
          self._processed_hits = None  # processed hits
          self.snippets = []
+        self.query_terms = query_terms
  
          if 'score' in doc:
              self._score = doc['score']
@@ -546,20 +547,22 @@ class SearchResult(object):
              header_span = header_span is not None and int(header_span) or 1
              fragment = doc.get("fragment_anchor", None)
              snippets_pos = (doc['snippets_position'], doc['snippets_length'])
-            snippets_rev = doc['snippets_revision']
+            snippets_rev = doc.get('snippets_revision', None)
  
              hit = (sec + (header_span,), fragment, self._score, {
                  'how_found': how_found,
                  'snippets_pos': snippets_pos,
-                'snippets_revision': snippets_rev
+                'snippets_revision': snippets_rev,
+                'themes': doc.get('themes', []),
+                'themes_pl': doc.get('themes_pl', [])
                  })
  
              self._hits.append(hit)
  
      def __unicode__(self):
-        return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
+        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
              (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
-    
+
      def __str__(self):
          return unicode(self).encode('utf-8')
  
@@ -647,19 +650,25 @@ class SearchResult(object):
              except catalogue.models.Fragment.DoesNotExist:
                  # stale index
                  continue
-
              # Figure out if we were searching for a token matching some word in theme name.
              themes = frag.tags.filter(category='theme')
-            themes_hit = []
-            # if self.searched is not None:
-            #     tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
-            #     for theme in themes:
-            #         name_tokens = self.search.get_tokens(theme.name, 'POLISH')
-            #         for t in tokens:
-            #             if t in name_tokens:
-            #                 if not theme in themes_hit:
-            #                     themes_hit.append(theme)
-            #                 break
+            themes_hit = set()
+            if self.query_terms is not None:
+                for i in range(0, len(f[self.OTHER]['themes'])):
+                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
+                    tms = map(unicode.lower, tms)
+                    for qt in self.query_terms:
+                        if qt in tms:
+                            themes_hit.add(f[self.OTHER]['themes'][i])
+                            break
+
+            def theme_by_name(n):
+                th = filter(lambda t: t.name == n, themes)
+                if th:
+                    return th[0]
+                else:
+                    return None
+            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
  
              m = {'score': f[self.SCORE],
                   'fragment': frag,
@@ -713,70 +722,8 @@ class Search(SolrIndex):
      Search facilities.
      """
      def __init__(self, default_field="text"):
-        super(Search, self).__init__()
+        super(Search, self).__init__(mode='r')
  
-    # def get_tokens(self, searched, field='text', cached=None):
-    #     """returns tokens analyzed by a proper (for a field) analyzer
-    #     argument can be: StringReader, string/unicode, or tokens. In the last case
-    #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
-    #     """
-    #     if cached is not None and field in cached:
-    #         return cached[field]
-
-    #     if isinstance(searched, str) or isinstance(searched, unicode):
-    #         searched = StringReader(searched)
-    #     elif isinstance(searched, list):
-    #         return searched
-
-    #     searched.reset()
-    #     tokens = self.analyzer.reusableTokenStream(field, searched)
-    #     toks = []
-    #     while tokens.incrementToken():
-    #         cta = tokens.getAttribute(CharTermAttribute.class_)
-    #         toks.append(cta.toString())
-
-    #     if cached is not None:
-    #         cached[field] = toks
-
-    #     return toks
-
-    # @staticmethod
-    # def fuzziness(fuzzy):
-    #     """Helper method to sanitize fuzziness"""
-    #     if not fuzzy:
-    #         return None
-    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
-    #         return fuzzy
-    #     else:
-    #         return 0.5
-
-    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
-    #     """
-    #     Return a PhraseQuery with a series of tokens.
-    #     """
-    #     if fuzzy:
-    #         phrase = MultiPhraseQuery()
-    #         for t in tokens:
-    #             term = Term(field, t)
-    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
-    #             fuzzterms = []
-
-    #             while True:
-    #                 ft = fuzzterm.term()
-    #                 if ft:
-    #                     fuzzterms.append(ft)
-    #                 if not fuzzterm.next(): break
-    #             if fuzzterms:
-    #                 phrase.add(JArray('object')(fuzzterms, Term))
-    #             else:
-    #                 phrase.add(term)
-    #     else:
-    #         phrase = PhraseQuery()
-    #         phrase.setSlop(slop)
-    #         for t in tokens:
-    #             term = Term(field, t)
-    #             phrase.add(term)
-    #     return phrase
  
      def make_term_query(self, query, field='text', modal=operator.or_):
          """
@@ -784,6 +731,7 @@ class Search(SolrIndex):
          modal - applies to boolean query
          fuzzy - should the query by fuzzy.
          """
+        if query is None: query = ''
          q = self.index.Q()
          q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                          query.split(r" ")), q)
@@ -802,8 +750,7 @@ class Search(SolrIndex):
          return [SearchResult(found, how_found=u'search_phrase') for found in res]
  
      def search_some(self, searched, fields, book=True,
-                    filters=None,
-                    snippets=True):
+                    filters=None, snippets=True, query_terms=None):
          assert isinstance(fields, list)
          if filters is None: filters = []
          if book: filters.append(self.index.Q(is_book=True))
@@ -816,82 +763,10 @@ class Search(SolrIndex):
          query = self.index.query(query)
          query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
          res = query.execute()
-        return [SearchResult(found, how_found='search_some') for found in res]
-
-    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
-    #     """
-    #     Search for perfect book matches. Just see if the query matches with some author or title,
-    #     taking hints into account.
-    #     """
-    #     fields_to_search = ['authors', 'title']
-    #     only_in = None
-    #     if hint:
-    #         if not hint.should_search_for_book():
-    #             return []
-    #         fields_to_search = hint.just_search_in(fields_to_search)
-    #         only_in = hint.book_filter()
-
-    #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
-
-    #     books = []
-    #     for q in qrys:
-    #         top = self.searcher.search(q,
-    #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-    #             max_results)
-    #         for found in top.scoreDocs:
-    #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
-    #     return books
-
-    # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
-    #     fields_to_search = ['tags', 'authors', 'title']
-
-    #     only_in = None
-    #     if hint:
-    #         if not hint.should_search_for_book():
-    #             return []
-    #         fields_to_search = hint.just_search_in(fields_to_search)
-    #         only_in = hint.book_filter()
-
-    #     tokens = self.get_tokens(searched, field='SIMPLE')
-
-    #     q = BooleanQuery()
-
-    #     for fld in fields_to_search:
-    #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
-    #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
-    #     books = []
-    #     top = self.searcher.search(q,
-    #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-    #         max_results)
-    #     for found in top.scoreDocs:
-    #         books.append(SearchResult(self, found, how_found="search_book"))
-
-    #     return books
-
-    # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
-    #     """
-    #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
-    #     some part/fragment of the book.
-    #     """
-    #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
+        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
  
-    #     flt = None
-    #     if hint:
-    #         flt = hint.part_filter()
  
-    #     books = []
-    #     for q in qrys:
-    #         top = self.searcher.search(q,
-    #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
-    #                                                        flt]),
-    #                                    max_results)
-    #         for found in top.scoreDocs:
-    #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
-
-    #     return books
-
-    def search_everywhere(self, searched):
+    def search_everywhere(self, searched, query_terms=None):
          """
          Tries to use search terms to match different fields of book (or its parts).
          E.g. one word can be an author survey, another be a part of the title, and the rest
@@ -899,7 +774,6 @@ class Search(SolrIndex):
          """
          books = []
          # content only query : themes x content
-
          q = self.make_term_query(searched, 'text')
          q_themes = self.make_term_query(searched, 'themes_pl')
  
@@ -907,7 +781,7 @@ class Search(SolrIndex):
          res = query.execute()
  
          for found in res:
-            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent'))
+            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
  
          # query themes/content x author/title/tags
          in_content = self.index.Q()
@@ -921,8 +795,9 @@ class Search(SolrIndex):
  
          q = in_content & in_meta
          res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
+
          for found in res:
-            books.append(SearchResult(found, how_found='search_everywhere'))
+            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
  
          return books
  
@@ -946,7 +821,6 @@ class Search(SolrIndex):
                      continue
                  text = snippets.get((int(position),
                                       int(length)))
-                print "== %s -- %s ==" % (query, text)
                  snip = self.index.highlight(text=text, field=field, q=query)
                  snips[idx] = snip
                  if snip:
@@ -954,7 +828,7 @@ class Search(SolrIndex):
                  idx += 1
  
          except IOError, e:
-            log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
+            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
              return []
          finally:
              snippets.close()
@@ -963,6 +837,7 @@ class Search(SolrIndex):
          snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
  
          searchresult.snippets = snips
+
          return snips
  
      def hint_tags(self, query, pdcounter=True, prefix=True):
@@ -991,6 +866,8 @@ class Search(SolrIndex):
          res = self.apply_filters(query, filters).execute()
  
          tags = []
+        pd_tags = []
+
          for doc in res:
              is_pdcounter = doc.get('is_pdcounter', False)
              category = doc.get('tag_category')
@@ -1002,17 +879,19 @@ class Search(SolrIndex):
                          tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                          tag.category = 'pd_book'  # make it look more lik a tag.
                      else:
-                        print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
+                        print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)).encode('utf-8')
+                    pd_tags.append(tag)
                  else:
                      tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
-                    # don't add the pdcounter tag if same tag already exists
-
-                tags.append(tag)
+                    tags.append(tag)
  
              except catalogue.models.Tag.DoesNotExist: pass
              except PDCounterAuthor.DoesNotExist: pass
              except PDCounterBook.DoesNotExist: pass
  
+        tags_slugs = set(map(lambda t: t.slug, tags))
+        tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
+
          log.debug('search_tags: %s' % tags)
  
          return tags
@@ -1038,40 +917,18 @@ class Search(SolrIndex):
          Searches for Book objects using query
          """
          bks = []
+        bks_found = set()
+        query = query.query(is_book=True)
          res = self.apply_filters(query, filters).field_limit(['book_id'])
          for r in res:
              try:
-                bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
+                bid = r['book_id']
+                if not bid in bks_found:
+                    bks.append(catalogue.models.Book.objects.get(id=bid))
+                    bks_found.add(bid)
              except catalogue.models.Book.DoesNotExist: pass
          return bks
   
-    # def make_prefix_phrase(self, toks, field):
-    #     q = MultiPhraseQuery()
-    #     for i in range(len(toks)):
-    #         t = Term(field, toks[i])
-    #         if i == len(toks) - 1:
-    #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
-    #             if pterms:
-    #                 q.add(pterms)
-    #             else:
-    #                 q.add(t)
-    #         else:
-    #             q.add(t)
-    #     return q
-
-    # @staticmethod
-    # def term_filter(term, inverse=False):
-    #     only_term = TermsFilter()
-    #     only_term.addTerm(term)
-
-    #     if inverse:
-    #         neg = BooleanFilter()
-    #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
-    #         only_term = neg
-
-    #     return only_term
-
-
  
      @staticmethod
      def apply_filters(query, filters):
@@ -1083,15 +940,3 @@ class Search(SolrIndex):
          for f in filters:
              query = query.query(f)
          return query
-
-    # def filtered_categories(self, tags):
-    #     """
-    #     Return a list of tag categories, present in tags list.
-    #     """
-    #     cats = {}
-    #     for t in tags:
-    #         cats[t.category] = True
-    #     return cats.keys()
-
-    # def hint(self):
-    #     return Hint(self)