reading state api

[wolnelektury.git] / src / search / index.py
diff --git a/src/search/index.py b/src/search/index.py

index 4c278ea..d3377b1 100644 (file)
--- a/src/search/index.py
+++ b/src/search/index.py
@@ -10,6 +10,7 @@ from librarian import dcparser
  from librarian.parser import WLDocument
  from lxml import etree
  import catalogue.models
  from librarian.parser import WLDocument
  from lxml import etree
  import catalogue.models
+import picture.models
  from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  from itertools import chain
  import sunburnt
  from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  from itertools import chain
  import sunburnt
@@ -20,6 +21,13 @@ from wolnelektury.utils import makedirs
  
  log = logging.getLogger('search')
  
  
  log = logging.getLogger('search')
  
+if os.path.isfile(settings.SOLR_STOPWORDS):
+    stopwords = set(
+        line.decode('utf-8').strip()
+        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
+else:
+    stopwords = set()
+
  
  class SolrIndex(object):
      def __init__(self, mode=None):
  
  class SolrIndex(object):
      def __init__(self, mode=None):
@@ -297,7 +305,6 @@ class Index(SolrIndex):
              book_info = dcparser.parse(open(book.xml_file.path))
  
          fields['slug'] = book.slug
              book_info = dcparser.parse(open(book.xml_file.path))
  
          fields['slug'] = book.slug
-        fields['tags'] = [t.name for t in book.tags]
          fields['is_book'] = True
  
          # validator, name
          fields['is_book'] = True
  
          # validator, name
@@ -508,6 +515,47 @@ class Index(SolrIndex):
          finally:
              snippets.close()
  
          finally:
              snippets.close()
  
+    def remove_picture(self, picture_or_id):
+        """Removes a picture from search index."""
+        if isinstance(picture_or_id, picture.models.Picture):
+            picture_id = picture_or_id.id
+        else:
+            picture_id = picture_or_id
+        self.delete_query(self.index.Q(picture_id=picture_id))
+
+    def index_picture(self, picture, picture_info=None, overwrite=True):
+        """
+        Indexes the picture.
+        Creates a lucene document for extracted metadata
+        and calls self.index_area() to index the contents of the picture.
+        """
+        if overwrite:
+            # we don't remove snippets, since they might be still needed by
+            # threads using not reopened index
+            self.remove_picture(picture)
+
+        picture_doc = {'picture_id': int(picture.id)}
+        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
+            'authors', 'title', 'epochs', 'kinds', 'genres'])
+
+        picture_doc.update(meta_fields)
+
+        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
+        self.index.add(picture_doc)
+        del picture_doc['is_book']
+        for area in picture.areas.all():
+            self.index_area(area, picture_fields=picture_doc)
+
+    def index_area(self, area, picture_fields):
+        """
+        Indexes themes and objects on the area.
+        """
+        doc = dict(picture_fields)
+        doc['area_id'] = area.id
+        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
+        doc['uid'] = 'area%s' % area.id
+        self.index.add(doc)
+
  
  class SearchResult(object):
      def __init__(self, doc, how_found=None, query_terms=None):
  
  class SearchResult(object):
      def __init__(self, doc, how_found=None, query_terms=None):
@@ -551,6 +599,17 @@ class SearchResult(object):
  
              self._hits.append(hit)
  
  
              self._hits.append(hit)
  
+    @classmethod
+    def from_book(cls, book, how_found=None, query_terms=None):
+        doc = {
+            'score': book.popularity.count,
+            'book_id': book.id,
+            'published_date': 0,
+        }
+        result = cls(doc, how_found=how_found, query_terms=query_terms)
+        result._book = book
+        return result
+
      def __unicode__(self):
          return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
              (self.book_id, len(self._hits),
      def __unicode__(self):
          return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
              (self.book_id, len(self._hits),
@@ -566,10 +625,9 @@ class SearchResult(object):
  
      def merge(self, other):
          if self.book_id != other.book_id:
  
      def merge(self, other):
          if self.book_id != other.book_id:
-            raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
+            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
          self._hits += other._hits
          self._hits += other._hits
-        if other.score > self.score:
-            self._score = other._score
+        self._score += max(other._score, 0)
          return self
  
      def get_book(self):
          return self
  
      def get_book(self):
@@ -708,6 +766,110 @@ class SearchResult(object):
              return None
  
  
              return None
  
  
+class PictureResult(object):
+    def __init__(self, doc, how_found=None, query_terms=None):
+        self.boost = 1.0
+        self.query_terms = query_terms
+        self._picture = None
+        self._hits = []
+        self._processed_hits = None
+
+        if 'score' in doc:
+            self._score = doc['score']
+        else:
+            self._score = 0
+
+        self.picture_id = int(doc["picture_id"])
+
+        if doc.get('area_id'):
+            hit = (self._score, {
+                'how_found': how_found,
+                'area_id': doc['area_id'],
+                'themes': doc.get('themes', []),
+                'themes_pl': doc.get('themes_pl', []),
+            })
+
+            self._hits.append(hit)
+
+    def __unicode__(self):
+        return u"<PR id=%d score=%f >" % (self.picture_id, self._score)
+
+    def __repr__(self):
+        return unicode(self)
+
+    @property
+    def score(self):
+        return self._score * self.boost
+
+    def merge(self, other):
+        if self.picture_id != other.picture_id:
+            raise ValueError(
+                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
+        self._hits += other._hits
+        self._score += max(other._score, 0)
+        return self
+
+    SCORE = 0
+    OTHER = 1
+
+    @property
+    def hits(self):
+        if self._processed_hits is not None:
+            return self._processed_hits
+
+        hits = []
+        for hit in self._hits:
+            try:
+                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
+            except picture.models.PictureArea.DoesNotExist:
+                # stale index
+                continue
+            # Figure out if we were searching for a token matching some word in theme name.
+            themes_hit = set()
+            if self.query_terms is not None:
+                for i in range(0, len(hit[self.OTHER]['themes'])):
+                    tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
+                    tms = map(unicode.lower, tms)
+                    for qt in self.query_terms:
+                        if qt in tms:
+                            themes_hit.add(hit[self.OTHER]['themes'][i])
+                            break
+
+            m = {
+                'score': hit[self.SCORE],
+                'area': area,
+                'themes_hit': themes_hit,
+            }
+            m.update(hit[self.OTHER])
+            hits.append(m)
+
+        hits.sort(key=lambda h: h['score'], reverse=True)
+        hits = hits[:1]
+        self._processed_hits = hits
+        return hits
+
+    def get_picture(self):
+        if self._picture is None:
+            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
+        return self._picture
+
+    picture = property(get_picture)
+
+    @staticmethod
+    def aggregate(*result_lists):
+        books = {}
+        for rl in result_lists:
+            for r in rl:
+                if r.picture_id in books:
+                    books[r.picture_id].merge(r)
+                else:
+                    books[r.picture_id] = r
+        return books.values()
+
+    def __cmp__(self, other):
+        return cmp(self.score, other.score)
+
+
  class Search(SolrIndex):
      """
      Search facilities.
  class Search(SolrIndex):
      """
      Search facilities.
@@ -728,23 +890,51 @@ class Search(SolrIndex):
  
          return q
  
  
          return q
  
-    def search_words(self, words, fields, book=True):
+    def search_by_author(self, words):
+        from catalogue.models import Book
+        books = Book.objects.filter(parent=None).order_by('-popularity__count')
+        for word in words:
+            books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
+        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
+
+    def search_words(self, words, fields, required=None, book=True, picture=False):
+        if book and not picture and fields == ['authors']:
+            return self.search_by_author(words)
          filters = []
          for word in words:
          filters = []
          for word in words:
-            word_filter = None
-            for field in fields:
-                q = self.index.Q(**{field: word})
-                if word_filter is None:
-                    word_filter = q
-                else:
-                    word_filter |= q
-            filters.append(word_filter)
+            if book or picture or (word not in stopwords):
+                word_filter = None
+                for field in fields:
+                    q = self.index.Q(**{field: word})
+                    if word_filter is None:
+                        word_filter = q
+                    else:
+                        word_filter |= q
+                filters.append(word_filter)
+        if required:
+            required_filter = None
+            for field in required:
+                for word in words:
+                    if book or picture or (word not in stopwords):
+                        q = self.index.Q(**{field: word})
+                        if required_filter is None:
+                            required_filter = q
+                        else:
+                            required_filter |= q
+            filters.append(required_filter)
+        if not filters:
+            return []
+        params = {}
          if book:
          if book:
-            query = self.index.query(is_book=True)
+            params['is_book'] = True
+        if picture:
+            params['picture_id__gt'] = 0
          else:
          else:
-            query = self.index.query()
+            params['book_id__gt'] = 0
+        query = self.index.query(**params)
          query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
          query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
-        return [SearchResult(found, how_found='search_words', query_terms=words) for found in query.execute()]
+        result_class = PictureResult if picture else SearchResult
+        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
  
      def get_snippets(self, searchresult, query, field='text', num=1):
          """
  
      def get_snippets(self, searchresult, query, field='text', num=1):
          """
@@ -767,6 +957,8 @@ class Search(SolrIndex):
                  text = snippets.get((int(position),
                                       int(length)))
                  snip = self.index.highlight(text=text, field=field, q=query)
                  text = snippets.get((int(position),
                                       int(length)))
                  snip = self.index.highlight(text=text, field=field, q=query)
+                if not snip and field == 'text':
+                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                  if snip not in snips:
                      snips[idx] = snip
                      if snip:
                  if snip not in snips:
                      snips[idx] = snip
                      if snip: