You forgot XML.

[wolnelektury.git] / src / search / index.py
diff --git a/src/search/index.py b/src/search/index.py

index 7bc61c5..d3377b1 100644 (file)
--- a/src/search/index.py
+++ b/src/search/index.py
@@ -10,6 +10,7 @@ from librarian import dcparser
  from librarian.parser import WLDocument
  from lxml import etree
  import catalogue.models
  from librarian.parser import WLDocument
  from lxml import etree
  import catalogue.models
+import picture.models
  from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  from itertools import chain
  import sunburnt
  from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  from itertools import chain
  import sunburnt
@@ -20,6 +21,13 @@ from wolnelektury.utils import makedirs
  
  log = logging.getLogger('search')
  
  
  log = logging.getLogger('search')
  
+# Set of words that Search.search_words leaves out of queries which target
+# neither books nor pictures. Built once at import time from the file named
+# by settings.SOLR_STOPWORDS; lines starting with '#' are skipped. When the
+# file does not exist the set is empty.
+if os.path.isfile(settings.SOLR_STOPWORDS):
+    # The context manager guarantees the file handle is closed as soon as
+    # the set has been built, instead of leaving it to garbage collection.
+    with open(settings.SOLR_STOPWORDS) as stopwords_file:
+        stopwords = set(
+            line.decode('utf-8').strip()
+            for line in stopwords_file if not line.startswith('#'))
+else:
+    stopwords = set()
+
  
  class SolrIndex(object):
      def __init__(self, mode=None):
  
  class SolrIndex(object):
      def __init__(self, mode=None):
@@ -271,14 +279,14 @@ class Index(SolrIndex):
          'dramat_wierszowany_lp',
          'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
          'wywiad',
          'dramat_wierszowany_lp',
          'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
          'wywiad',
-        ]
+    ]
  
      ignore_content_tags = [
  
      ignore_content_tags = [
-        'uwaga', 'extra', 'nota_red',
+        'uwaga', 'extra', 'nota_red', 'abstrakt',
          'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
          'didaskalia',
          'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
          'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
          'didaskalia',
          'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
-        ]
+    ]
  
      footnote_tags = ['pa', 'pt', 'pr', 'pe']
  
  
      footnote_tags = ['pa', 'pt', 'pr', 'pe']
  
@@ -297,7 +305,6 @@ class Index(SolrIndex):
              book_info = dcparser.parse(open(book.xml_file.path))
  
          fields['slug'] = book.slug
              book_info = dcparser.parse(open(book.xml_file.path))
  
          fields['slug'] = book.slug
-        fields['tags'] = [t.name for t in book.tags]
          fields['is_book'] = True
  
          # validator, name
          fields['is_book'] = True
  
          # validator, name
@@ -508,6 +515,47 @@ class Index(SolrIndex):
          finally:
              snippets.close()
  
          finally:
              snippets.close()
  
+    def remove_picture(self, picture_or_id):
+        """
+        Issues a delete query for index documents carrying the picture's id.
+
+        `picture_or_id` may be a picture.models.Picture instance (its `id`
+        attribute is used) or any other value, which is taken as the id itself.
+        """
+        is_model = isinstance(picture_or_id, picture.models.Picture)
+        picture_id = picture_or_id.id if is_model else picture_or_id
+        self.delete_query(self.index.Q(picture_id=picture_id))
+
+    def index_picture(self, picture, picture_info=None, overwrite=True):
+        """
+        Adds a picture and all of its areas to the search index.
+
+        One index document is built from the picture id plus the metadata
+        returned by self.extract_metadata() (restricted to authors, title,
+        epochs, kinds and genres). The same fields, minus 'is_book', are then
+        passed to self.index_area() for every area of the picture.
+
+        With `overwrite` set, documents previously indexed for this picture
+        are removed first via self.remove_picture().
+        """
+        if overwrite:
+            # we don't remove snippets, since they might be still needed by
+            # threads using not reopened index
+            self.remove_picture(picture)
+
+        doc = {'picture_id': int(picture.id)}
+        doc.update(self.extract_metadata(
+            picture, picture_info,
+            dc_only=['authors', 'title', 'epochs', 'kinds', 'genres']))
+        doc['uid'] = "picture%s" % doc['picture_id']
+        self.index.add(doc)
+
+        # Area documents reuse the picture's fields, but without 'is_book'.
+        doc.pop('is_book')
+        for area in picture.areas.all():
+            self.index_area(area, picture_fields=doc)
+
+    def index_area(self, area, picture_fields):
+        """
+        Adds one index document for a picture area.
+
+        The document is a copy of `picture_fields` extended with the area id,
+        the names of the area's tags in the 'theme' and 'thing' categories
+        (stored under 'themes'), and a uid of the form 'area<id>'.
+        `picture_fields` itself is not modified.
+        """
+        theme_names = area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True)
+        area_doc = dict(picture_fields)
+        area_doc.update({
+            'area_id': area.id,
+            'themes': list(theme_names),
+            'uid': 'area%s' % area.id,
+        })
+        self.index.add(area_doc)
+
  
  class SearchResult(object):
      def __init__(self, doc, how_found=None, query_terms=None):
  
  class SearchResult(object):
      def __init__(self, doc, how_found=None, query_terms=None):
@@ -551,6 +599,17 @@ class SearchResult(object):
  
              self._hits.append(hit)
  
  
              self._hits.append(hit)
  
+    @classmethod
+    def from_book(cls, book, how_found=None, query_terms=None):
+        """
+        Alternate constructor: wraps an already loaded book in a result.
+
+        A minimal document is synthesised for the regular constructor, with
+        the book's popularity count as the score and 0 as the published
+        date. The book object is stored on the result as `_book`.
+        """
+        result = cls(
+            {
+                'score': book.popularity.count,
+                'book_id': book.id,
+                'published_date': 0,
+            },
+            how_found=how_found,
+            query_terms=query_terms)
+        result._book = book
+        return result
+
      def __unicode__(self):
          return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
              (self.book_id, len(self._hits),
      def __unicode__(self):
          return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
              (self.book_id, len(self._hits),
@@ -566,10 +625,9 @@ class SearchResult(object):
  
      def merge(self, other):
          if self.book_id != other.book_id:
  
      def merge(self, other):
          if self.book_id != other.book_id:
-            raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
+            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
          self._hits += other._hits
          self._hits += other._hits
-        if other.score > self.score:
-            self._score = other._score
+        self._score += max(other._score, 0)
          return self
  
      def get_book(self):
          return self
  
      def get_book(self):
@@ -670,7 +728,7 @@ class SearchResult(object):
              m.update(f[self.OTHER])
              hits.append(m)
  
              m.update(f[self.OTHER])
              hits.append(m)
  
-        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
+        hits.sort(key=lambda h: h['score'], reverse=True)
  
          self._processed_hits = hits
  
  
          self._processed_hits = hits
  
@@ -708,6 +766,110 @@ class SearchResult(object):
              return None
  
  
              return None
  
  
+class PictureResult(object):
+    """
+    A picture found by a search, together with the picture areas that matched.
+
+    An instance is built from a single result document (a mapping with at
+    least 'picture_id'); results referring to the same picture can be
+    combined with merge() or aggregate().
+    """
+
+    # Positions of the score and of the remaining data inside a raw hit tuple.
+    SCORE = 0
+    OTHER = 1
+
+    def __init__(self, doc, how_found=None, query_terms=None):
+        """
+        `doc` is the result document; `how_found` is stored with the hit;
+        `query_terms` are the searched words, used by `hits` to work out
+        which themes matched.
+        """
+        self.boost = 1.0
+        self.query_terms = query_terms
+        self._picture = None
+        self._hits = []
+        self._processed_hits = None
+
+        self._score = doc.get('score', 0)
+        self.picture_id = int(doc["picture_id"])
+
+        # Only documents describing an area contribute a hit.
+        if doc.get('area_id'):
+            hit = (self._score, {
+                'how_found': how_found,
+                'area_id': doc['area_id'],
+                'themes': doc.get('themes', []),
+                'themes_pl': doc.get('themes_pl', []),
+            })
+
+            self._hits.append(hit)
+
+    def __unicode__(self):
+        return u"<PR id=%d score=%f >" % (self.picture_id, self._score)
+
+    def __repr__(self):
+        return unicode(self)
+
+    @property
+    def score(self):
+        """Raw score multiplied by the boost factor."""
+        return self._score * self.boost
+
+    def merge(self, other):
+        """
+        Folds another result for the same picture into this one.
+
+        Hits are concatenated and the other's score is added when positive.
+        Returns self. Raises ValueError when the picture ids differ.
+        """
+        if self.picture_id != other.picture_id:
+            raise ValueError(
+                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
+        self._hits += other._hits
+        self._score += max(other._score, 0)
+        return self
+
+    @property
+    def hits(self):
+        """
+        Processed hits: a list holding at most one dict, the best-scoring one.
+
+        Each dict has 'score', 'area' (the PictureArea object), 'themes_hit'
+        (theme names containing one of the query terms as a word) and the
+        raw hit data. Hits whose area no longer exists in the database are
+        dropped. The result is cached after the first access.
+        """
+        if self._processed_hits is not None:
+            return self._processed_hits
+
+        hits = []
+        for hit in self._hits:
+            other = hit[self.OTHER]
+            try:
+                area = picture.models.PictureArea.objects.get(id=other['area_id'])
+            except picture.models.PictureArea.DoesNotExist:
+                # stale index
+                continue
+            # Figure out if we were searching for a token matching some word in theme name.
+            themes_hit = set()
+            if self.query_terms is not None:
+                themes = other['themes']
+                themes_pl = other['themes_pl']
+                for i, theme in enumerate(themes):
+                    # str.split() takes a literal separator, not a regex;
+                    # split on whitespace to get the individual words.
+                    words = theme.split()
+                    # themes_pl defaults to [] and may be shorter than themes.
+                    if i < len(themes_pl):
+                        words += themes_pl[i].split()
+                    # .lower() on each item works for both str and unicode.
+                    words = [word.lower() for word in words]
+                    if any(qt in words for qt in self.query_terms):
+                        themes_hit.add(theme)
+
+            m = {
+                'score': hit[self.SCORE],
+                'area': area,
+                'themes_hit': themes_hit,
+            }
+            m.update(other)
+            hits.append(m)
+
+        hits.sort(key=lambda h: h['score'], reverse=True)
+        # Keep only the best-scoring area.
+        hits = hits[:1]
+        self._processed_hits = hits
+        return hits
+
+    def get_picture(self):
+        """Loads the Picture from the database on first use and caches it."""
+        if self._picture is None:
+            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
+        return self._picture
+
+    picture = property(get_picture)
+
+    @staticmethod
+    def aggregate(*result_lists):
+        """
+        Merges results from several lists so that each picture appears once.
+
+        The first result seen for a picture absorbs the later ones through
+        merge(). Returns the values of the per-picture mapping.
+        """
+        pictures = {}
+        for rl in result_lists:
+            for r in rl:
+                if r.picture_id in pictures:
+                    pictures[r.picture_id].merge(r)
+                else:
+                    pictures[r.picture_id] = r
+        return pictures.values()
+
+    def __cmp__(self, other):
+        return cmp(self.score, other.score)
+
+
  class Search(SolrIndex):
      """
      Search facilities.
  class Search(SolrIndex):
      """
      Search facilities.
@@ -728,23 +890,51 @@ class Search(SolrIndex):
  
          return q
  
  
          return q
  
-    def search_words(self, words, fields, book=True):
+    def search_by_author(self, words):
+        """
+        Finds books whose cached author contains every given word.
+
+        The lookup is a database query on Book (books with no parent only),
+        not a query against the search index. Returns a list of at most 30
+        SearchResult objects built with SearchResult.from_book(), ordered by
+        descending popularity count.
+        """
+        import re
+        from catalogue.models import Book
+        books = Book.objects.filter(parent=None).order_by('-popularity__count')
+        for word in words:
+            # Escape regex metacharacters so that input such as "c++" or "("
+            # is matched literally instead of producing an invalid pattern.
+            escaped = re.sub(r'([\\.^$*+?()\[\]{}|])', r'\\\1', word)
+            # NOTE(review): \m and \M look like PostgreSQL word-boundary
+            # escapes -- confirm the database backend.
+            books = books.filter(cached_author__iregex=r'\m%s\M' % escaped).select_related('popularity__count')
+        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
+
+    def search_words(self, words, fields, required=None, book=True, picture=False):
+        if book and not picture and fields == ['authors']:
+            return self.search_by_author(words)
          filters = []
          for word in words:
          filters = []
          for word in words:
-            word_filter = None
-            for field in fields:
-                q = self.index.Q(**{field: word})
-                if word_filter is None:
-                    word_filter = q
-                else:
-                    word_filter |= q
-            filters.append(word_filter)
+            if book or picture or (word not in stopwords):
+                word_filter = None
+                for field in fields:
+                    q = self.index.Q(**{field: word})
+                    if word_filter is None:
+                        word_filter = q
+                    else:
+                        word_filter |= q
+                filters.append(word_filter)
+        if required:
+            required_filter = None
+            for field in required:
+                for word in words:
+                    if book or picture or (word not in stopwords):
+                        q = self.index.Q(**{field: word})
+                        if required_filter is None:
+                            required_filter = q
+                        else:
+                            required_filter |= q
+            filters.append(required_filter)
+        if not filters:
+            return []
+        params = {}
          if book:
          if book:
-            query = self.index.query(is_book=True)
+            params['is_book'] = True
+        if picture:
+            params['picture_id__gt'] = 0
          else:
          else:
-            query = self.index.query()
+            params['book_id__gt'] = 0
+        query = self.index.query(**params)
          query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
          query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
-        return [SearchResult(found, how_found='search_words') for found in query.execute()]
+        result_class = PictureResult if picture else SearchResult
+        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
  
      def get_snippets(self, searchresult, query, field='text', num=1):
          """
  
      def get_snippets(self, searchresult, query, field='text', num=1):
          """
@@ -767,6 +957,8 @@ class Search(SolrIndex):
                  text = snippets.get((int(position),
                                       int(length)))
                  snip = self.index.highlight(text=text, field=field, q=query)
                  text = snippets.get((int(position),
                                       int(length)))
                  snip = self.index.highlight(text=text, field=field, q=query)
+                if not snip and field == 'text':
+                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                  if snip not in snips:
                      snips[idx] = snip
                      if snip:
                  if snip not in snips:
                      snips[idx] = snip
                      if snip:
@@ -790,104 +982,6 @@ class Search(SolrIndex):
  
          return snips
  
  
          return snips
  
-    def hint_tags(self, query, pdcounter=True, prefix=True):
-        """
-        Return auto-complete hints for tags
-        using prefix search.
-        """
-        q = self.index.Q()
-        query = query.strip()
-        for field in ['tag_name', 'tag_name_pl']:
-            if prefix:
-                q |= self.index.Q(**{field: query + "*"})
-            else:
-                q |= self.make_term_query(query, field=field)
-        qu = self.index.query(q)
-
-        return self.search_tags(qu, pdcounter=pdcounter)
-
-    def search_tags(self, query, filters=None, pdcounter=False):
-        """
-        Search for Tag objects using query.
-        """
-        if not filters:
-            filters = []
-        if not pdcounter:
-            filters.append(~self.index.Q(is_pdcounter=True))
-        res = self.apply_filters(query, filters).execute()
-
-        tags = []
-        pd_tags = []
-
-        for doc in res:
-            is_pdcounter = doc.get('is_pdcounter', False)
-            category = doc.get('tag_category')
-            try:
-                if is_pdcounter:
-                    if category == 'pd_author':
-                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
-                    elif category == 'pd_book':
-                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
-                        tag.category = 'pd_book'  # make it look more lik a tag.
-                    else:
-                        # WTF
-                        print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (
-                            int(doc.get('tag_id')), category)).encode('utf-8')
-                    pd_tags.append(tag)
-                else:
-                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
-                    tags.append(tag)
-
-            except catalogue.models.Tag.DoesNotExist:
-                pass
-            except PDCounterAuthor.DoesNotExist:
-                pass
-            except PDCounterBook.DoesNotExist:
-                pass
-
-        tags_slugs = set(map(lambda t: t.slug, tags))
-        tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
-
-        log.debug('search_tags: %s' % tags)
-
-        return tags
-
-    def hint_books(self, query, prefix=True):
-        """
-        Returns auto-complete hints for book titles
-        Because we do not index 'pseudo' title-tags.
-        Prefix search.
-        """
-        q = self.index.Q()
-        query = query.strip()
-        if prefix:
-            q |= self.index.Q(title=query + "*")
-            q |= self.index.Q(title_orig=query + "*")
-        else:
-            q |= self.make_term_query(query, field='title')
-            q |= self.make_term_query(query, field='title_orig')
-        qu = self.index.query(q)
-        only_books = self.index.Q(is_book=True)
-        return self.search_books(qu, [only_books])
-
-    def search_books(self, query, filters=None, max_results=10):
-        """
-        Searches for Book objects using query
-        """
-        bks = []
-        bks_found = set()
-        query = query.query(is_book=True)
-        res = self.apply_filters(query, filters).field_limit(['book_id'])
-        for r in res:
-            try:
-                bid = r['book_id']
-                if bid not in bks_found:
-                    bks.append(catalogue.models.Book.objects.get(id=bid))
-                    bks_found.add(bid)
-            except catalogue.models.Book.DoesNotExist:
-                pass
-        return bks
-
      @staticmethod
      def apply_filters(query, filters):
          """
      @staticmethod
      def apply_filters(query, filters):
          """