newsletter by mailchimp

[wolnelektury.git] / src / search / index.py
diff --git a/src/search/index.py b/src/search/index.py

index 31417ca..d3377b1 100644 (file)
--- a/src/search/index.py
+++ b/src/search/index.py
@@ -6,22 +6,29 @@ from django.conf import settings
  
  import os
  import re
  
  import os
  import re
-import errno
  from librarian import dcparser
  from librarian.parser import WLDocument
  from lxml import etree
  import catalogue.models
  from librarian import dcparser
  from librarian.parser import WLDocument
  from lxml import etree
  import catalogue.models
+import picture.models
  from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  from itertools import chain
  from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  from itertools import chain
-import traceback
-import logging
-log = logging.getLogger('search')
  import sunburnt
  import custom
  import operator
  import sunburnt
  import custom
  import operator
+import logging
+from wolnelektury.utils import makedirs
  
  log = logging.getLogger('search')
  
  
  log = logging.getLogger('search')
  
+if os.path.isfile(settings.SOLR_STOPWORDS):
+    stopwords = set(
+        line.decode('utf-8').strip()
+        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
+else:
+    stopwords = set()
+
+
  class SolrIndex(object):
      def __init__(self, mode=None):
          self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  class SolrIndex(object):
      def __init__(self, mode=None):
          self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
@@ -36,20 +43,18 @@ class Snippets(object):
      SNIPPET_DIR = "snippets"
  
      def __init__(self, book_id, revision=None):
      SNIPPET_DIR = "snippets"
  
      def __init__(self, book_id, revision=None):
-        try:
-            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
-        except OSError as exc:
-            if exc.errno == errno.EEXIST:
-                pass
-            else: raise
+        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
          self.book_id = book_id
          self.revision = revision
          self.file = None
          self.book_id = book_id
          self.revision = revision
          self.file = None
+        self.position = None
  
      @property
      def path(self):
  
      @property
      def path(self):
-        if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
-        else: fn = "%d" % self.book_id
+        if self.revision:
+            fn = "%d.%d" % (self.book_id, self.revision)
+        else:
+            fn = "%d" % self.book_id
  
          return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  
  
          return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  
@@ -57,7 +62,7 @@ class Snippets(object):
          """
          Open the snippet file. Call .close() afterwards.
          """
          """
          Open the snippet file. Call .close() afterwards.
          """
-        if not 'b' in mode:
+        if 'b' not in mode:
              mode += 'b'
  
          if 'w' in mode:
              mode += 'b'
  
          if 'w' in mode:
@@ -173,8 +178,9 @@ class Index(SolrIndex):
          if not remove_only:
              # then add them [all or just one passed]
              if not tags:
          if not remove_only:
              # then add them [all or just one passed]
              if not tags:
-                tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
-                    PDCounterAuthor.objects.all(), \
+                tags = chain(
+                    catalogue.models.Tag.objects.exclude(category='set'),
+                    PDCounterAuthor.objects.all(),
                      PDCounterBook.objects.all())
  
              for tag in tags:
                      PDCounterBook.objects.all())
  
              for tag in tags:
@@ -211,11 +217,9 @@ class Index(SolrIndex):
          """
          Create a lucene document referring book id.
          """
          """
          Create a lucene document referring book id.
          """
-        doc = {
-            'book_id': int(book.id),
-            }
+        doc = {'book_id': int(book.id)}
          if book.parent is not None:
          if book.parent is not None:
-            doc["parent_id"] = int(book.parent.id)
+            doc['parent_id'] = int(book.parent.id)
          return doc
  
      def remove_book(self, book_or_id, remove_snippets=True):
          return doc
  
      def remove_book(self, book_or_id, remove_snippets=True):
@@ -244,7 +248,8 @@ class Index(SolrIndex):
              self.remove_book(book, remove_snippets=False)
  
          book_doc = self.create_book_doc(book)
              self.remove_book(book, remove_snippets=False)
  
          book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
+        meta_fields = self.extract_metadata(book, book_info, dc_only=[
+            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
          # let's not index it - it's only used for extracting publish date
          if 'source_name' in meta_fields:
              del meta_fields['source_name']
          # let's not index it - it's only used for extracting publish date
          if 'source_name' in meta_fields:
              del meta_fields['source_name']
@@ -261,8 +266,9 @@ class Index(SolrIndex):
              'published_date': meta_fields['published_date']
              }
  
              'published_date': meta_fields['published_date']
              }
  
-        if 'translators' in meta_fields:
-            book_fields['translators'] = meta_fields['translators']
+        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
+            if tag_name in meta_fields:
+                book_fields[tag_name] = meta_fields[tag_name]
  
          self.index_content(book, book_fields=book_fields)
  
  
          self.index_content(book, book_fields=book_fields)
  
@@ -273,18 +279,19 @@ class Index(SolrIndex):
          'dramat_wierszowany_lp',
          'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
          'wywiad',
          'dramat_wierszowany_lp',
          'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
          'wywiad',
-        ]
+    ]
  
      ignore_content_tags = [
  
      ignore_content_tags = [
-        'uwaga', 'extra',
+        'uwaga', 'extra', 'nota_red', 'abstrakt',
          'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
          'didaskalia',
          'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
          'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
          'didaskalia',
          'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
-        ]
+    ]
  
      footnote_tags = ['pa', 'pt', 'pr', 'pe']
  
  
      footnote_tags = ['pa', 'pt', 'pr', 'pe']
  
-    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
+    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
+                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
  
      published_date_re = re.compile("([0-9]+)[\]. ]*$")
  
  
      published_date_re = re.compile("([0-9]+)[\]. ]*$")
  
@@ -298,7 +305,6 @@ class Index(SolrIndex):
              book_info = dcparser.parse(open(book.xml_file.path))
  
          fields['slug'] = book.slug
              book_info = dcparser.parse(open(book.xml_file.path))
  
          fields['slug'] = book.slug
-        fields['tags'] = [t.name  for t in book.tags]
          fields['is_book'] = True
  
          # validator, name
          fields['is_book'] = True
  
          # validator, name
@@ -332,7 +338,8 @@ class Index(SolrIndex):
              match = self.published_date_re.search(book_info.source_name)
              if match is not None:
                  pd = str(match.groups()[0])
              match = self.published_date_re.search(book_info.source_name)
              if match is not None:
                  pd = str(match.groups()[0])
-        if not pd: pd = ""
+        if not pd:
+            pd = ""
          fields["published_date"] = pd
  
          return fields
          fields["published_date"] = pd
  
          return fields
@@ -355,7 +362,7 @@ class Index(SolrIndex):
              if master.tag in self.master_tags:
                  return master
  
              if master.tag in self.master_tags:
                  return master
  
-    def index_content(self, book, book_fields={}):
+    def index_content(self, book, book_fields):
          """
          Walks the book XML and extract content from it.
          Adds parts for each header tag and for each fragment.
          """
          Walks the book XML and extract content from it.
          Adds parts for each header tag and for each fragment.
@@ -367,9 +374,8 @@ class Index(SolrIndex):
          if master is None:
              return []
  
          if master is None:
              return []
  
-        def walker(node, ignore_tags=[]):
-
-            if node.tag not in ignore_tags:
+        def walker(node):
+            if node.tag not in self.ignore_content_tags:
                  yield node, None, None
                  if node.text is not None:
                      yield None, node.text, None
                  yield node, None, None
                  if node.text is not None:
                      yield None, node.text, None
@@ -383,7 +389,7 @@ class Index(SolrIndex):
              return
  
          def fix_format(text):
              return
  
          def fix_format(text):
-            #            separator = [u" ", u"\t", u".", u";", u","]
+            # separator = [u" ", u"\t", u".", u";", u","]
              if isinstance(text, list):
                  # need to join it first
                  text = filter(lambda s: s is not None, content)
              if isinstance(text, list):
                  # need to join it first
                  text = filter(lambda s: s is not None, content)
@@ -420,17 +426,10 @@ class Index(SolrIndex):
  
              if 'themes' in fields:
                  doc['themes'] = fields['themes']
  
              if 'themes' in fields:
                  doc['themes'] = fields['themes']
-            doc['uid'] = "part%s%s%s" % (doc['header_index'],
-                                         doc['header_span'],
-                                         doc.get('fragment_anchor', ''))
+            doc['uid'] = "part%s-%s-%s-%s" % (
+                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
              return doc
  
              return doc
  
-        def give_me_utf8(s):
-            if isinstance(s, unicode):
-                return s.encode('utf-8')
-            else:
-                return s
-
          fragments = {}
          snippets = Snippets(book.id).open('w')
          try:
          fragments = {}
          snippets = Snippets(book.id).open('w')
          try:
@@ -451,7 +450,7 @@ class Index(SolrIndex):
                      content.append(text)
                  handle_text = [all_content]
  
                      content.append(text)
                  handle_text = [all_content]
  
-                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
+                for start, text, end in walker(header):
                      # handle footnotes
                      if start is not None and start.tag in self.footnote_tags:
                          footnote = []
                      # handle footnotes
                      if start is not None and start.tag in self.footnote_tags:
                          footnote = []
@@ -471,12 +470,13 @@ class Index(SolrIndex):
                      # handle fragments and themes.
                      if start is not None and start.tag == 'begin':
                          fid = start.attrib['id'][1:]
                      # handle fragments and themes.
                      if start is not None and start.tag == 'begin':
                          fid = start.attrib['id'][1:]
-                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
+                        fragments[fid] = {
+                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
  
                      # themes for this fragment
                      elif start is not None and start.tag == 'motyw':
                          fid = start.attrib['id'][1:]
  
                      # themes for this fragment
                      elif start is not None and start.tag == 'motyw':
                          fid = start.attrib['id'][1:]
-                        handle_text.append(None)
+                        handle_text.append(lambda text: None)
                          if start.text is not None:
                              fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                      elif end is not None and end.tag == 'motyw':
                          if start.text is not None:
                              fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                      elif end is not None and end.tag == 'motyw':
@@ -487,7 +487,7 @@ class Index(SolrIndex):
                          if fid not in fragments:
                              continue  # a broken <end> node, skip it
                          frag = fragments[fid]
                          if fid not in fragments:
                              continue  # a broken <end> node, skip it
                          frag = fragments[fid]
-                        if frag['themes'] == []:
+                        if not frag['themes']:
                              continue  # empty themes list.
                          del fragments[fid]
  
                              continue  # empty themes list.
                          del fragments[fid]
  
@@ -504,8 +504,7 @@ class Index(SolrIndex):
  
                      if text is not None and handle_text is not []:
                          hdl = handle_text[-1]
  
                      if text is not None and handle_text is not []:
                          hdl = handle_text[-1]
-                        if hdl is not None:
-                            hdl(text)
+                        hdl(text)
  
                          # in the end, add a section text.
                  doc = add_part(snippets, header_index=position,
  
                          # in the end, add a section text.
                  doc = add_part(snippets, header_index=position,
@@ -516,15 +515,56 @@ class Index(SolrIndex):
          finally:
              snippets.close()
  
          finally:
              snippets.close()
  
+    def remove_picture(self, picture_or_id):
+        """Removes a picture from search index."""
+        if isinstance(picture_or_id, picture.models.Picture):
+            picture_id = picture_or_id.id
+        else:
+            picture_id = picture_or_id
+        self.delete_query(self.index.Q(picture_id=picture_id))
+
+    def index_picture(self, picture, picture_info=None, overwrite=True):
+        """
+        Indexes the picture.
+        Creates a lucene document for extracted metadata
+        and calls self.index_area() to index the contents of the picture.
+        """
+        if overwrite:
+            # we don't remove snippets, since they might still be needed by
+            # threads using an index that has not been reopened
+            self.remove_picture(picture)
+
+        picture_doc = {'picture_id': int(picture.id)}
+        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
+            'authors', 'title', 'epochs', 'kinds', 'genres'])
+
+        picture_doc.update(meta_fields)
+
+        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
+        self.index.add(picture_doc)
+        del picture_doc['is_book']
+        for area in picture.areas.all():
+            self.index_area(area, picture_fields=picture_doc)
+
+    def index_area(self, area, picture_fields):
+        """
+        Indexes themes and objects on the area.
+        """
+        doc = dict(picture_fields)
+        doc['area_id'] = area.id
+        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
+        doc['uid'] = 'area%s' % area.id
+        self.index.add(doc)
+
  
  class SearchResult(object):
  
  class SearchResult(object):
-    def __init__(self, doc, how_found=None, query=None, query_terms=None):
-        #        self.search = search
+    def __init__(self, doc, how_found=None, query_terms=None):
          self.boost = 1.0
          self._hits = []
          self._processed_hits = None  # processed hits
          self.snippets = []
          self.query_terms = query_terms
          self.boost = 1.0
          self._hits = []
          self._processed_hits = None  # processed hits
          self.snippets = []
          self.query_terms = query_terms
+        self._book = None
  
          if 'score' in doc:
              self._score = doc['score']
  
          if 'score' in doc:
              self._score = doc['score']
@@ -559,9 +599,22 @@ class SearchResult(object):
  
              self._hits.append(hit)
  
  
              self._hits.append(hit)
  
+    @classmethod
+    def from_book(cls, book, how_found=None, query_terms=None):
+        doc = {
+            'score': book.popularity.count,
+            'book_id': book.id,
+            'published_date': 0,
+        }
+        result = cls(doc, how_found=how_found, query_terms=query_terms)
+        result._book = book
+        return result
+
      def __unicode__(self):
          return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
      def __unicode__(self):
          return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
-            (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
+            (self.book_id, len(self._hits),
+             len(self._processed_hits) if self._processed_hits else -1,
+             self._score, len(self.snippets))
  
      def __str__(self):
          return unicode(self).encode('utf-8')
  
      def __str__(self):
          return unicode(self).encode('utf-8')
@@ -572,14 +625,13 @@ class SearchResult(object):
  
      def merge(self, other):
          if self.book_id != other.book_id:
  
      def merge(self, other):
          if self.book_id != other.book_id:
-            raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
+            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
          self._hits += other._hits
          self._hits += other._hits
-        if other.score > self.score:
-            self._score = other._score
+        self._score += max(other._score, 0)
          return self
  
      def get_book(self):
          return self
  
      def get_book(self):
-        if hasattr(self, '_book'):
+        if self._book is not None:
              return self._book
          self._book = catalogue.models.Book.objects.get(id=self.book_id)
          return self._book
              return self._book
          self._book = catalogue.models.Book.objects.get(id=self.book_id)
          return self._book
@@ -605,11 +657,8 @@ class SearchResult(object):
  
          # sections not covered by fragments
          sect = filter(lambda s: 0 == len(filter(
  
          # sections not covered by fragments
          sect = filter(lambda s: 0 == len(filter(
-            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
-            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
-            frags)), sect)
-
-        hits = []
+            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
+                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
  
          def remove_duplicates(lst, keyfn, compare):
              els = {}
  
          def remove_duplicates(lst, keyfn, compare):
              els = {}
@@ -679,7 +728,7 @@ class SearchResult(object):
              m.update(f[self.OTHER])
              hits.append(m)
  
              m.update(f[self.OTHER])
              hits.append(m)
  
-        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
+        hits.sort(key=lambda h: h['score'], reverse=True)
  
          self._processed_hits = hits
  
  
          self._processed_hits = hits
  
@@ -713,93 +762,179 @@ class SearchResult(object):
      def snippet_revision(self, idx=0):
          try:
              return self.hits[idx]['snippets_revision']
      def snippet_revision(self, idx=0):
          try:
              return self.hits[idx]['snippets_revision']
-        except:
+        except (IndexError, KeyError):
              return None
  
  
              return None
  
  
-class Search(SolrIndex):
-    """
-    Search facilities.
-    """
-    def __init__(self, default_field="text"):
-        super(Search, self).__init__(mode='r')
+class PictureResult(object):
+    def __init__(self, doc, how_found=None, query_terms=None):
+        self.boost = 1.0
+        self.query_terms = query_terms
+        self._picture = None
+        self._hits = []
+        self._processed_hits = None
  
  
+        if 'score' in doc:
+            self._score = doc['score']
+        else:
+            self._score = 0
  
  
-    def make_term_query(self, query, field='text', modal=operator.or_):
-        """
-        Returns term queries joined by boolean query.
-        modal - applies to boolean query
-        fuzzy - should the query by fuzzy.
-        """
-        if query is None: query = ''
-        q = self.index.Q()
-        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
-                        query.split(r" ")), q)
+        self.picture_id = int(doc["picture_id"])
  
  
-        return q
+        if doc.get('area_id'):
+            hit = (self._score, {
+                'how_found': how_found,
+                'area_id': doc['area_id'],
+                'themes': doc.get('themes', []),
+                'themes_pl': doc.get('themes_pl', []),
+            })
  
  
-    def search_phrase(self, searched, field='text', book=False,
-                      filters=None,
-                      snippets=False):
-        if filters is None: filters = []
-        if book: filters.append(self.index.Q(is_book=True))
+            self._hits.append(hit)
  
  
-        q = self.index.query(**{field: searched})
-        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
-        res = q.execute()
-        return [SearchResult(found, how_found=u'search_phrase') for found in res]
+    def __unicode__(self):
+        return u"<PR id=%d score=%f >" % (self.picture_id, self._score)
  
  
-    def search_some(self, searched, fields, book=True,
-                    filters=None, snippets=True, query_terms=None):
-        assert isinstance(fields, list)
-        if filters is None: filters = []
-        if book: filters.append(self.index.Q(is_book=True))
+    def __repr__(self):
+        return unicode(self)
  
  
-        query = self.index.Q()
+    @property
+    def score(self):
+        return self._score * self.boost
  
  
-        for fld in fields:
-            query = self.index.Q(query | self.make_term_query(searched, fld))
+    def merge(self, other):
+        if self.picture_id != other.picture_id:
+            raise ValueError(
+                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
+        self._hits += other._hits
+        self._score += max(other._score, 0)
+        return self
  
  
-        query = self.index.query(query)
-        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
-        res = query.execute()
-        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
+    SCORE = 0
+    OTHER = 1
  
  
+    @property
+    def hits(self):
+        if self._processed_hits is not None:
+            return self._processed_hits
  
  
-    def search_everywhere(self, searched, query_terms=None):
-        """
-        Tries to use search terms to match different fields of book (or its parts).
-        E.g. one word can be an author survey, another be a part of the title, and the rest
-        are some words from third chapter.
-        """
-        books = []
-        # content only query : themes x content
-        q = self.make_term_query(searched, 'text')
-        q_themes = self.make_term_query(searched, 'themes_pl')
+        hits = []
+        for hit in self._hits:
+            try:
+                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
+            except picture.models.PictureArea.DoesNotExist:
+                # stale index
+                continue
+            # Figure out if we were searching for a token matching some word in theme name.
+            themes_hit = set()
+            if self.query_terms is not None:
+                for i in range(0, len(hit[self.OTHER]['themes'])):
+                    tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
+                    tms = map(unicode.lower, tms)
+                    for qt in self.query_terms:
+                        if qt in tms:
+                            themes_hit.add(hit[self.OTHER]['themes'][i])
+                            break
  
  
-        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
-        res = query.execute()
+            m = {
+                'score': hit[self.SCORE],
+                'area': area,
+                'themes_hit': themes_hit,
+            }
+            m.update(hit[self.OTHER])
+            hits.append(m)
  
  
-        for found in res:
-            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
+        hits.sort(key=lambda h: h['score'], reverse=True)
+        hits = hits[:1]
+        self._processed_hits = hits
+        return hits
  
  
-        # query themes/content x author/title/tags
-        in_content = self.index.Q()
-        in_meta = self.index.Q()
+    def get_picture(self):
+        if self._picture is None:
+            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
+        return self._picture
  
  
-        for fld in ['themes_pl', 'text']:
-            in_content |= self.make_term_query(searched, field=fld)
+    picture = property(get_picture)
  
  
-        for fld in ['tags', 'authors', 'title']:
-            in_meta |= self.make_term_query(searched, field=fld)
+    @staticmethod
+    def aggregate(*result_lists):
+        books = {}
+        for rl in result_lists:
+            for r in rl:
+                if r.picture_id in books:
+                    books[r.picture_id].merge(r)
+                else:
+                    books[r.picture_id] = r
+        return books.values()
  
  
-        q = in_content & in_meta
-        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
+    def __cmp__(self, other):
+        return cmp(self.score, other.score)
  
  
-        for found in res:
-            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
  
  
-        return books
+class Search(SolrIndex):
+    """
+    Search facilities.
+    """
+    def __init__(self, default_field="text"):
+        super(Search, self).__init__(mode='r')
+
+    def make_term_query(self, query, field='text', modal=operator.or_):
+        """
+        Returns term queries joined by boolean query.
+        modal - applies to boolean query
+        fuzzy - should the query be fuzzy.
+        """
+        if query is None:
+            query = ''
+        q = self.index.Q()
+        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
+
+        return q
+
+    def search_by_author(self, words):
+        from catalogue.models import Book
+        books = Book.objects.filter(parent=None).order_by('-popularity__count')
+        for word in words:
+            books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
+        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
+
+    def search_words(self, words, fields, required=None, book=True, picture=False):
+        if book and not picture and fields == ['authors']:
+            return self.search_by_author(words)
+        filters = []
+        for word in words:
+            if book or picture or (word not in stopwords):
+                word_filter = None
+                for field in fields:
+                    q = self.index.Q(**{field: word})
+                    if word_filter is None:
+                        word_filter = q
+                    else:
+                        word_filter |= q
+                filters.append(word_filter)
+        if required:
+            required_filter = None
+            for field in required:
+                for word in words:
+                    if book or picture or (word not in stopwords):
+                        q = self.index.Q(**{field: word})
+                        if required_filter is None:
+                            required_filter = q
+                        else:
+                            required_filter |= q
+            filters.append(required_filter)
+        if not filters:
+            return []
+        params = {}
+        if book:
+            params['is_book'] = True
+        if picture:
+            params['picture_id__gt'] = 0
+        else:
+            params['book_id__gt'] = 0
+        query = self.index.query(**params)
+        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
+        result_class = PictureResult if picture else SearchResult
+        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
  
      def get_snippets(self, searchresult, query, field='text', num=1):
          """
  
      def get_snippets(self, searchresult, query, field='text', num=1):
          """
@@ -822,13 +957,20 @@ class Search(SolrIndex):
                  text = snippets.get((int(position),
                                       int(length)))
                  snip = self.index.highlight(text=text, field=field, q=query)
                  text = snippets.get((int(position),
                                       int(length)))
                  snip = self.index.highlight(text=text, field=field, q=query)
-                snips[idx] = snip
-                if snip:
-                    num -= 1
+                if not snip and field == 'text':
+                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
+                if snip not in snips:
+                    snips[idx] = snip
+                    if snip:
+                        num -= 1
                  idx += 1
  
          except IOError, e:
                  idx += 1
  
          except IOError, e:
-            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
+            book = catalogue.models.Book.objects.filter(id=book_id)
+            if not book:
+                log.error("Book does not exist for book id = %d" % book_id)
+            elif not book.get().children.exists():
+                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
              return []
          finally:
              snippets.close()
              return []
          finally:
              snippets.close()
@@ -840,102 +982,13 @@ class Search(SolrIndex):
  
          return snips
  
  
          return snips
  
-    def hint_tags(self, query, pdcounter=True, prefix=True):
-        """
-        Return auto-complete hints for tags
-        using prefix search.
-        """
-        q = self.index.Q()
-        query = query.strip()
-        for field in ['tag_name', 'tag_name_pl']:
-            if prefix:
-                q |= self.index.Q(**{field: query + "*"})
-            else:
-                q |= self.make_term_query(query, field=field)
-        qu = self.index.query(q)
-
-        return self.search_tags(qu, pdcounter=pdcounter)
-
-    def search_tags(self, query, filters=None, pdcounter=False):
-        """
-        Search for Tag objects using query.
-        """
-        if not filters: filters = []
-        if not pdcounter:
-            filters.append(~self.index.Q(is_pdcounter=True))
-        res = self.apply_filters(query, filters).execute()
-
-        tags = []
-        pd_tags = []
-
-        for doc in res:
-            is_pdcounter = doc.get('is_pdcounter', False)
-            category = doc.get('tag_category')
-            try:
-                if is_pdcounter == True:
-                    if category == 'pd_author':
-                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
-                    elif category == 'pd_book':
-                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
-                        tag.category = 'pd_book'  # make it look more lik a tag.
-                    else:
-                        print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)).encode('utf-8')
-                    pd_tags.append(tag)
-                else:
-                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
-                    tags.append(tag)
-
-            except catalogue.models.Tag.DoesNotExist: pass
-            except PDCounterAuthor.DoesNotExist: pass
-            except PDCounterBook.DoesNotExist: pass
-
-        tags_slugs = set(map(lambda t: t.slug, tags))
-        tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
-
-        log.debug('search_tags: %s' % tags)
-
-        return tags
-
-    def hint_books(self, query, prefix=True):
-        """
-        Returns auto-complete hints for book titles
-        Because we do not index 'pseudo' title-tags.
-        Prefix search.
-        """
-        q = self.index.Q()
-        query = query.strip()
-        if prefix:
-            q |= self.index.Q(title=query + "*")
-        else:
-            q |= self.make_term_query(query, field='title')
-        qu = self.index.query(q)
-        only_books = self.index.Q(is_book=True)
-        return self.search_books(qu, [only_books])
-
-    def search_books(self, query, filters=None, max_results=10):
-        """
-        Searches for Book objects using query
-        """
-        bks = []
-        bks_found = set()
-        query = query.query(is_book=True)
-        res = self.apply_filters(query, filters).field_limit(['book_id'])
-        for r in res:
-            try:
-                bid = r['book_id']
-                if not bid in bks_found:
-                    bks.append(catalogue.models.Book.objects.get(id=bid))
-                    bks_found.add(bid)
-            except catalogue.models.Book.DoesNotExist: pass
-        return bks
- 
-
      @staticmethod
      def apply_filters(query, filters):
          """
          Apply filters to a query
          """
      @staticmethod
      def apply_filters(query, filters):
          """
          Apply filters to a query
          """
-        if filters is None: filters = []
+        if filters is None:
+            filters = []
          filters = filter(lambda x: x is not None, filters)
          for f in filters:
              query = query.query(f)
          filters = filter(lambda x: x is not None, filters)
          for f in filters:
              query = query.query(f)