Searching, filtering fixes.
[wolnelektury.git] / src / search / index.py
index 31417ca..22c9a02 100644 (file)
@@ -1,27 +1,37 @@
-# -*- coding: utf-8 -*-
 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
-from django.conf import settings
-
+from functools import reduce, total_ordering
+from itertools import chain
+import logging
+import operator
 import os
 import re
 import os
 import re
-import errno
+from django.conf import settings
 from librarian import dcparser
 from librarian import dcparser
+import librarian.meta.types.date
+import librarian.meta.types.person
+import librarian.meta.types.text
 from librarian.parser import WLDocument
 from lxml import etree
 from librarian.parser import WLDocument
 from lxml import etree
+import scorched
 import catalogue.models
 import catalogue.models
+import picture.models
 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
-from itertools import chain
-import traceback
-import logging
-log = logging.getLogger('search')
-import sunburnt
-import custom
-import operator
+from wolnelektury.utils import makedirs
+from . import custom
 
 log = logging.getLogger('search')
 
 
 log = logging.getLogger('search')
 
+
+if os.path.isfile(settings.SOLR_STOPWORDS):
+    stopwords = set(
+        line.strip()
+        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
+else:
+    stopwords = set()
+
+
 class SolrIndex(object):
     def __init__(self, mode=None):
         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
 class SolrIndex(object):
     def __init__(self, mode=None):
         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
@@ -36,20 +46,18 @@ class Snippets(object):
     SNIPPET_DIR = "snippets"
 
     def __init__(self, book_id, revision=None):
     SNIPPET_DIR = "snippets"
 
     def __init__(self, book_id, revision=None):
-        try:
-            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
-        except OSError as exc:
-            if exc.errno == errno.EEXIST:
-                pass
-            else: raise
+        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
         self.book_id = book_id
         self.revision = revision
         self.file = None
         self.book_id = book_id
         self.revision = revision
         self.file = None
+        self.position = None
 
     @property
     def path(self):
 
     @property
     def path(self):
-        if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
-        else: fn = "%d" % self.book_id
+        if self.revision:
+            fn = "%d.%d" % (self.book_id, self.revision)
+        else:
+            fn = "%d" % self.book_id
 
         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
 
 
         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
 
@@ -57,7 +65,7 @@ class Snippets(object):
         """
         Open the snippet file. Call .close() afterwards.
         """
         """
         Open the snippet file. Call .close() afterwards.
         """
-        if not 'b' in mode:
+        if 'b' not in mode:
             mode += 'b'
 
         if 'w' in mode:
             mode += 'b'
 
         if 'w' in mode:
@@ -90,7 +98,10 @@ class Snippets(object):
         of the snippet stored there.
         """
         self.file.seek(pos[0], 0)
         of the snippet stored there.
         """
         self.file.seek(pos[0], 0)
-        txt = self.file.read(pos[1]).decode('utf-8')
+        try:
+            txt = self.file.read(pos[1]).decode('utf-8')
+        except:
+            return ''
         return txt
 
     def close(self):
         return txt
 
     def close(self):
@@ -117,6 +128,22 @@ class Index(SolrIndex):
     def __init__(self):
         super(Index, self).__init__(mode='rw')
 
     def __init__(self):
         super(Index, self).__init__(mode='rw')
 
+    def remove_snippets(self, book):
+        book.snippet_set.all().delete()
+
+    def add_snippet(self, book, doc):
+        assert book.id == doc.pop('book_id')
+        # Fragments already exist and can be indexed where they live.
+        if 'fragment_anchor' in doc:
+            return
+
+        text = doc.pop('text')
+        header_index = doc.pop('header_index')
+        book.snippet_set.create(
+            sec=header_index,
+            text=text,
+        )
+
     def delete_query(self, *queries):
         """
         index.delete(queries=...) doesn't work, so let's reimplement it
     def delete_query(self, *queries):
         """
         index.delete(queries=...) doesn't work, so let's reimplement it
@@ -124,7 +151,7 @@ class Index(SolrIndex):
         """
         uids = set()
         for q in queries:
         """
         uids = set()
         for q in queries:
-            if isinstance(q, sunburnt.search.LuceneQuery):
+            if isinstance(q, scorched.search.LuceneQuery):
                 q = self.index.query(q)
             q.field_limiter.update(['uid'])
             st = 0
                 q = self.index.query(q)
             q.field_limiter.update(['uid'])
             st = 0
@@ -137,7 +164,8 @@ class Index(SolrIndex):
                     uids.add(res['uid'])
                 st += rows
         if uids:
                     uids.add(res['uid'])
                 st += rows
         if uids:
-            self.index.delete(uids)
+            # FIXME: With Solr API change, this doesn't work.
+            #self.index.delete(uids)
             return True
         else:
             return False
             return True
         else:
             return False
@@ -173,8 +201,9 @@ class Index(SolrIndex):
         if not remove_only:
             # then add them [all or just one passed]
             if not tags:
         if not remove_only:
             # then add them [all or just one passed]
             if not tags:
-                tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
-                    PDCounterAuthor.objects.all(), \
+                tags = chain(
+                    catalogue.models.Tag.objects.exclude(category='set'),
+                    PDCounterAuthor.objects.all(),
                     PDCounterBook.objects.all())
 
             for tag in tags:
                     PDCounterBook.objects.all())
 
             for tag in tags:
@@ -211,40 +240,38 @@ class Index(SolrIndex):
         """
         Create a lucene document referring book id.
         """
         """
         Create a lucene document referring book id.
         """
-        doc = {
-            'book_id': int(book.id),
-            }
+        doc = {'book_id': int(book.id)}
         if book.parent is not None:
         if book.parent is not None:
-            doc["parent_id"] = int(book.parent.id)
+            doc['parent_id'] = int(book.parent.id)
         return doc
 
         return doc
 
-    def remove_book(self, book_or_id, remove_snippets=True):
+    def remove_book(self, book, remove_snippets=True, legacy=True):
         """Removes a book from search index.
         book - Book instance."""
         """Removes a book from search index.
         book - Book instance."""
-        if isinstance(book_or_id, catalogue.models.Book):
-            book_id = book_or_id.id
-        else:
-            book_id = book_or_id
-
-        self.delete_query(self.index.Q(book_id=book_id))
+        if legacy:
+          self.delete_query(self.index.Q(book_id=book.id))
 
 
-        if remove_snippets:
-            snippets = Snippets(book_id)
+          if remove_snippets:
+            snippets = Snippets(book.id)
             snippets.remove()
             snippets.remove()
+        self.remove_snippets(book)
 
 
-    def index_book(self, book, book_info=None, overwrite=True):
+    def index_book(self, book, book_info=None, overwrite=True, legacy=True):
         """
         Indexes the book.
         Creates a lucene document for extracted metadata
         and calls self.index_content() to index the contents of the book.
         """
         """
         Indexes the book.
         Creates a lucene document for extracted metadata
         and calls self.index_content() to index the contents of the book.
         """
+        if not book.xml_file: return
+
         if overwrite:
             # we don't remove snippets, since they might be still needed by
             # threads using not reopened index
         if overwrite:
             # we don't remove snippets, since they might be still needed by
             # threads using not reopened index
-            self.remove_book(book, remove_snippets=False)
+            self.remove_book(book, remove_snippets=False, legacy=legacy)
 
         book_doc = self.create_book_doc(book)
 
         book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
+        meta_fields = self.extract_metadata(book, book_info, dc_only=[
+            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
         # let's not index it - it's only used for extracting publish date
         if 'source_name' in meta_fields:
             del meta_fields['source_name']
         # let's not index it - it's only used for extracting publish date
         if 'source_name' in meta_fields:
             del meta_fields['source_name']
@@ -253,7 +280,8 @@ class Index(SolrIndex):
             book_doc[n] = f
 
         book_doc['uid'] = "book%s" % book_doc['book_id']
             book_doc[n] = f
 
         book_doc['uid'] = "book%s" % book_doc['book_id']
-        self.index.add(book_doc)
+        if legacy:
+            self.index.add(book_doc)
         del book_doc
         book_fields = {
             'title': meta_fields['title'],
         del book_doc
         book_fields = {
             'title': meta_fields['title'],
@@ -261,10 +289,11 @@ class Index(SolrIndex):
             'published_date': meta_fields['published_date']
             }
 
             'published_date': meta_fields['published_date']
             }
 
-        if 'translators' in meta_fields:
-            book_fields['translators'] = meta_fields['translators']
+        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
+            if tag_name in meta_fields:
+                book_fields[tag_name] = meta_fields[tag_name]
 
 
-        self.index_content(book, book_fields=book_fields)
+        self.index_content(book, book_fields=book_fields, legacy=legacy)
 
     master_tags = [
         'opowiadanie',
 
     master_tags = [
         'opowiadanie',
@@ -273,18 +302,19 @@ class Index(SolrIndex):
         'dramat_wierszowany_lp',
         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
         'wywiad',
         'dramat_wierszowany_lp',
         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
         'wywiad',
-        ]
+    ]
 
     ignore_content_tags = [
 
     ignore_content_tags = [
-        'uwaga', 'extra',
+        'uwaga', 'extra', 'nota_red', 'abstrakt',
         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
         'didaskalia',
         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
         'didaskalia',
         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
-        ]
+    ]
 
     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 
 
     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 
-    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
+    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
+                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 
     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 
 
     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 
@@ -295,10 +325,9 @@ class Index(SolrIndex):
         fields = {}
 
         if book_info is None:
         fields = {}
 
         if book_info is None:
-            book_info = dcparser.parse(open(book.xml_file.path))
+            book_info = dcparser.parse(open(book.xml_file.path, 'rb'))
 
         fields['slug'] = book.slug
 
         fields['slug'] = book.slug
-        fields['tags'] = [t.name  for t in book.tags]
         fields['is_book'] = True
 
         # validator, name
         fields['is_book'] = True
 
         # validator, name
@@ -308,21 +337,20 @@ class Index(SolrIndex):
             if hasattr(book_info, field.name):
                 if not getattr(book_info, field.name):
                     continue
             if hasattr(book_info, field.name):
                 if not getattr(book_info, field.name):
                     continue
-                # since no type information is available, we use validator
-                type_indicator = field.validator
-                if type_indicator == dcparser.as_unicode:
+                type_indicator = field.value_type
+                if issubclass(type_indicator, librarian.meta.types.text.TextValue):
                     s = getattr(book_info, field.name)
                     if field.multiple:
                         s = ', '.join(s)
                     fields[field.name] = s
                     s = getattr(book_info, field.name)
                     if field.multiple:
                         s = ', '.join(s)
                     fields[field.name] = s
-                elif type_indicator == dcparser.as_person:
+                elif issubclass(type_indicator, librarian.meta.types.person.Person):
                     p = getattr(book_info, field.name)
                     p = getattr(book_info, field.name)
-                    if isinstance(p, dcparser.Person):
-                        persons = unicode(p)
+                    if isinstance(p, librarian.meta.types.person.Person):
+                        persons = str(p)
                     else:
                     else:
-                        persons = ', '.join(map(unicode, p))
+                        persons = ', '.join(map(str, p))
                     fields[field.name] = persons
                     fields[field.name] = persons
-                elif type_indicator == dcparser.as_date:
+                elif issubclass(type_indicator, librarian.meta.types.date.DateValue):
                     dt = getattr(book_info, field.name)
                     fields[field.name] = dt
 
                     dt = getattr(book_info, field.name)
                     fields[field.name] = dt
 
@@ -332,7 +360,8 @@ class Index(SolrIndex):
             match = self.published_date_re.search(book_info.source_name)
             if match is not None:
                 pd = str(match.groups()[0])
             match = self.published_date_re.search(book_info.source_name)
             if match is not None:
                 pd = str(match.groups()[0])
-        if not pd: pd = ""
+        if not pd:
+            pd = ""
         fields["published_date"] = pd
 
         return fields
         fields["published_date"] = pd
 
         return fields
@@ -355,7 +384,7 @@ class Index(SolrIndex):
             if master.tag in self.master_tags:
                 return master
 
             if master.tag in self.master_tags:
                 return master
 
-    def index_content(self, book, book_fields={}):
+    def index_content(self, book, book_fields, legacy=True):
         """
         Walks the book XML and extract content from it.
         Adds parts for each header tag and for each fragment.
         """
         Walks the book XML and extract content from it.
         Adds parts for each header tag and for each fragment.
@@ -367,9 +396,8 @@ class Index(SolrIndex):
         if master is None:
             return []
 
         if master is None:
             return []
 
-        def walker(node, ignore_tags=[]):
-
-            if node.tag not in ignore_tags:
+        def walker(node):
+            if node.tag not in self.ignore_content_tags:
                 yield node, None, None
                 if node.text is not None:
                     yield None, node.text, None
                 yield node, None, None
                 if node.text is not None:
                     yield None, node.text, None
@@ -383,16 +411,16 @@ class Index(SolrIndex):
             return
 
         def fix_format(text):
             return
 
         def fix_format(text):
-            #            separator = [u" ", u"\t", u".", u";", u","]
+            # separator = [" ", "\t", ".", ";", ","]
             if isinstance(text, list):
                 # need to join it first
                 text = filter(lambda s: s is not None, content)
             if isinstance(text, list):
                 # need to join it first
                 text = filter(lambda s: s is not None, content)
-                text = u' '.join(text)
+                text = ' '.join(text)
                 # for i in range(len(text)):
                 #     if i > 0:
                 #         if text[i][0] not in separator\
                 #             and text[i - 1][-1] not in separator:
                 # for i in range(len(text)):
                 #     if i > 0:
                 #         if text[i][0] not in separator\
                 #             and text[i - 1][-1] not in separator:
-                #          text.insert(i, u" ")
+                #          text.insert(i, " ")
 
             return re.sub("(?m)/$", "", text)
 
 
             return re.sub("(?m)/$", "", text)
 
@@ -420,17 +448,10 @@ class Index(SolrIndex):
 
             if 'themes' in fields:
                 doc['themes'] = fields['themes']
 
             if 'themes' in fields:
                 doc['themes'] = fields['themes']
-            doc['uid'] = "part%s%s%s" % (doc['header_index'],
-                                         doc['header_span'],
-                                         doc.get('fragment_anchor', ''))
+            doc['uid'] = "part%s-%s-%s-%s" % (
+                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
             return doc
 
             return doc
 
-        def give_me_utf8(s):
-            if isinstance(s, unicode):
-                return s.encode('utf-8')
-            else:
-                return s
-
         fragments = {}
         snippets = Snippets(book.id).open('w')
         try:
         fragments = {}
         snippets = Snippets(book.id).open('w')
         try:
@@ -451,7 +472,7 @@ class Index(SolrIndex):
                     content.append(text)
                 handle_text = [all_content]
 
                     content.append(text)
                 handle_text = [all_content]
 
-                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
+                for start, text, end in walker(header):
                     # handle footnotes
                     if start is not None and start.tag in self.footnote_tags:
                         footnote = []
                     # handle footnotes
                     if start is not None and start.tag in self.footnote_tags:
                         footnote = []
@@ -463,22 +484,24 @@ class Index(SolrIndex):
                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                         handle_text.pop()
                         doc = add_part(snippets, header_index=position, header_type=header.tag,
                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                         handle_text.pop()
                         doc = add_part(snippets, header_index=position, header_type=header.tag,
-                                       text=u''.join(footnote),
-                                       is_footnote=True)
-                        self.index.add(doc)
+                                       text=''.join(footnote))
+                        self.add_snippet(book, doc)
+                        if legacy:
+                            self.index.add(doc)
                         footnote = []
 
                     # handle fragments and themes.
                     if start is not None and start.tag == 'begin':
                         fid = start.attrib['id'][1:]
                         footnote = []
 
                     # handle fragments and themes.
                     if start is not None and start.tag == 'begin':
                         fid = start.attrib['id'][1:]
-                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
+                        fragments[fid] = {
+                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 
                     # themes for this fragment
                     elif start is not None and start.tag == 'motyw':
                         fid = start.attrib['id'][1:]
 
                     # themes for this fragment
                     elif start is not None and start.tag == 'motyw':
                         fid = start.attrib['id'][1:]
-                        handle_text.append(None)
+                        handle_text.append(lambda text: None)
                         if start.text is not None:
                         if start.text is not None:
-                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
+                            fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
                     elif end is not None and end.tag == 'motyw':
                         handle_text.pop()
 
                     elif end is not None and end.tag == 'motyw':
                         handle_text.pop()
 
@@ -487,7 +510,7 @@ class Index(SolrIndex):
                         if fid not in fragments:
                             continue  # a broken <end> node, skip it
                         frag = fragments[fid]
                         if fid not in fragments:
                             continue  # a broken <end> node, skip it
                         frag = fragments[fid]
-                        if frag['themes'] == []:
+                        if not frag['themes']:
                             continue  # empty themes list.
                         del fragments[fid]
 
                             continue  # empty themes list.
                         del fragments[fid]
 
@@ -498,33 +521,79 @@ class Index(SolrIndex):
                                        fragment_anchor=fid,
                                        text=fix_format(frag['text']),
                                        themes=frag['themes'])
                                        fragment_anchor=fid,
                                        text=fix_format(frag['text']),
                                        themes=frag['themes'])
-                        self.index.add(doc)
+                        # Add searchable fragment
+                        self.add_snippet(book, doc)
+                        if legacy:
+                            self.index.add(doc)
 
                         # Collect content.
 
                     if text is not None and handle_text is not []:
                         hdl = handle_text[-1]
 
                         # Collect content.
 
                     if text is not None and handle_text is not []:
                         hdl = handle_text[-1]
-                        if hdl is not None:
-                            hdl(text)
+                        hdl(text)
 
                         # in the end, add a section text.
                 doc = add_part(snippets, header_index=position,
                                header_type=header.tag, text=fix_format(content))
 
 
                         # in the end, add a section text.
                 doc = add_part(snippets, header_index=position,
                                header_type=header.tag, text=fix_format(content))
 
-                self.index.add(doc)
+                self.add_snippet(book, doc)
+                if legacy:
+                    self.index.add(doc)
 
         finally:
             snippets.close()
 
 
         finally:
             snippets.close()
 
+    def remove_picture(self, picture_or_id):
+        """Removes a picture from search index."""
+        if isinstance(picture_or_id, picture.models.Picture):
+            picture_id = picture_or_id.id
+        else:
+            picture_id = picture_or_id
+        self.delete_query(self.index.Q(picture_id=picture_id))
+
+    def index_picture(self, picture, picture_info=None, overwrite=True):
+        """
+        Indexes the picture.
+        Creates a lucene document for extracted metadata
+        and calls self.index_area() to index the contents of the picture.
+        """
+        if overwrite:
+            # we don't remove snippets, since they might be still needed by
+            # threads using not reopened index
+            self.remove_picture(picture)
+
+        picture_doc = {'picture_id': int(picture.id)}
+        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
+            'authors', 'title', 'epochs', 'kinds', 'genres'])
+
+        picture_doc.update(meta_fields)
+
+        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
+        self.index.add(picture_doc)
+        del picture_doc['is_book']
+        for area in picture.areas.all():
+            self.index_area(area, picture_fields=picture_doc)
 
 
+    def index_area(self, area, picture_fields):
+        """
+        Indexes themes and objects on the area.
+        """
+        doc = dict(picture_fields)
+        doc['area_id'] = area.id
+        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
+        doc['uid'] = 'area%s' % area.id
+        self.index.add(doc)
+
+
+@total_ordering
 class SearchResult(object):
 class SearchResult(object):
-    def __init__(self, doc, how_found=None, query=None, query_terms=None):
-        #        self.search = search
+    def __init__(self, doc, how_found=None, query_terms=None):
         self.boost = 1.0
         self._hits = []
         self._processed_hits = None  # processed hits
         self.snippets = []
         self.query_terms = query_terms
         self.boost = 1.0
         self._hits = []
         self._processed_hits = None  # processed hits
         self.snippets = []
         self.query_terms = query_terms
+        self._book = None
 
         if 'score' in doc:
             self._score = doc['score']
 
         if 'score' in doc:
             self._score = doc['score']
@@ -559,12 +628,25 @@ class SearchResult(object):
 
             self._hits.append(hit)
 
 
             self._hits.append(hit)
 
-    def __unicode__(self):
-        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
-            (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
+    @classmethod
+    def from_book(cls, book, how_found=None, query_terms=None):
+        doc = {
+            'score': book.popularity.count,
+            'book_id': book.id,
+            'published_date': 0,
+        }
+        result = cls(doc, how_found=how_found, query_terms=query_terms)
+        result._book = book
+        return result
 
     def __str__(self):
 
     def __str__(self):
-        return unicode(self).encode('utf-8')
+        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
+            (self.book_id, len(self._hits),
+             len(self._processed_hits) if self._processed_hits else -1,
+             self._score, len(self.snippets))
+
+    def __bytes__(self):
+        return str(self).encode('utf-8')
 
     @property
     def score(self):
 
     @property
     def score(self):
@@ -572,16 +654,18 @@ class SearchResult(object):
 
     def merge(self, other):
         if self.book_id != other.book_id:
 
     def merge(self, other):
         if self.book_id != other.book_id:
-            raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
+            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
         self._hits += other._hits
         self._hits += other._hits
-        if other.score > self.score:
-            self._score = other._score
+        self._score += max(other._score, 0)
         return self
 
     def get_book(self):
         return self
 
     def get_book(self):
-        if hasattr(self, '_book'):
+        if self._book is not None:
             return self._book
             return self._book
-        self._book = catalogue.models.Book.objects.get(id=self.book_id)
+        try:
+            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
+        except catalogue.models.Book.DoesNotExist:
+            self._book = None
         return self._book
 
     book = property(get_book)
         return self._book
 
     book = property(get_book)
@@ -601,30 +685,25 @@ class SearchResult(object):
         # to sections and fragments
         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 
         # to sections and fragments
         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 
-        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
+        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
 
         # sections not covered by fragments
 
         # sections not covered by fragments
-        sect = filter(lambda s: 0 == len(filter(
-            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
-            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
-            frags)), sect)
+        sect = filter(lambda s: 0 == len(list(filter(
+            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
+                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
 
 
-        hits = []
-
-        def remove_duplicates(lst, keyfn, compare):
+        def remove_duplicates(lst, keyfn, larger):
             els = {}
             for e in lst:
                 eif = keyfn(e)
                 if eif in els:
             els = {}
             for e in lst:
                 eif = keyfn(e)
                 if eif in els:
-                    if compare(els[eif], e) >= 1:
+                    if larger(els[eif], e):
                         continue
                 els[eif] = e
             return els.values()
 
         # remove fragments with duplicated fid's and duplicated snippets
                         continue
                 els[eif] = e
             return els.values()
 
         # remove fragments with duplicated fid's and duplicated snippets
-        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
-        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
-        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
+        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
 
         # remove duplicate sections
         sections = {}
 
         # remove duplicate sections
         sections = {}
@@ -642,7 +721,7 @@ class SearchResult(object):
             m.update(s[self.OTHER])
             sections[si] = m
 
             m.update(s[self.OTHER])
             sections[si] = m
 
-        hits = sections.values()
+        hits = list(sections.values())
 
         for f in frags:
             try:
 
         for f in frags:
             try:
@@ -656,19 +735,19 @@ class SearchResult(object):
             if self.query_terms is not None:
                 for i in range(0, len(f[self.OTHER]['themes'])):
                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
             if self.query_terms is not None:
                 for i in range(0, len(f[self.OTHER]['themes'])):
                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
-                    tms = map(unicode.lower, tms)
+                    tms = map(str.lower, tms)
                     for qt in self.query_terms:
                         if qt in tms:
                             themes_hit.add(f[self.OTHER]['themes'][i])
                             break
 
             def theme_by_name(n):
                     for qt in self.query_terms:
                         if qt in tms:
                             themes_hit.add(f[self.OTHER]['themes'][i])
                             break
 
             def theme_by_name(n):
-                th = filter(lambda t: t.name == n, themes)
+                th = list(filter(lambda t: t.name == n, themes))
                 if th:
                     return th[0]
                 else:
                     return None
                 if th:
                     return th[0]
                 else:
                     return None
-            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
+            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
 
             m = {'score': f[self.SCORE],
                  'fragment': frag,
 
             m = {'score': f[self.SCORE],
                  'fragment': frag,
@@ -679,7 +758,7 @@ class SearchResult(object):
             m.update(f[self.OTHER])
             hits.append(m)
 
             m.update(f[self.OTHER])
             hits.append(m)
 
-        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
+        hits.sort(key=lambda h: h['score'], reverse=True)
 
         self._processed_hits = hits
 
 
         self._processed_hits = hits
 
@@ -696,13 +775,17 @@ class SearchResult(object):
                     books[r.book_id] = r
         return books.values()
 
                     books[r.book_id] = r
         return books.values()
 
-    def __cmp__(self, other):
-        c = cmp(self.score, other.score)
-        if c == 0:
-            # this is inverted, because earlier date is better
-            return cmp(other.published_date, self.published_date)
-        else:
-            return c
+    def get_sort_key(self):
+        """Return the tuple this result is ordered by.
+
+        The tuple is (negated score, published date, author sort key,
+        title sort key); the last two are empty strings when the result
+        has no book attached.
+        """
+        key = [-self.score, self.published_date]
+        key.append(self.book.sort_key_author if self.book else '')
+        key.append(self.book.sort_key if self.book else '')
+        return tuple(key)
+
+    def __lt__(self, other):
+        """Order results so that a *larger* sort key means a *smaller* result.
+
+        The sort key starts with the negated score, so the result with the
+        lower score compares as less-than. Objects that are not search
+        results are not comparable and yield ``NotImplemented``.
+        """
+        if not isinstance(other, SearchResult):
+            return NotImplemented
+        return self.get_sort_key() > other.get_sort_key()
+
+    def __eq__(self, other):
+        """Two results are equal when their sort keys are equal.
+
+        Comparing with anything that is not a search result returns
+        ``NotImplemented`` (so ``result == None`` is simply False) instead
+        of failing with AttributeError.
+        """
+        if not isinstance(other, SearchResult):
+            return NotImplemented
+        return self.get_sort_key() == other.get_sort_key()
 
     def __len__(self):
         return len(self.hits)
 
     def __len__(self):
         return len(self.hits)
@@ -713,93 +796,183 @@ class SearchResult(object):
     def snippet_revision(self, idx=0):
         try:
             return self.hits[idx]['snippets_revision']
     def snippet_revision(self, idx=0):
         try:
             return self.hits[idx]['snippets_revision']
-        except:
+        except (IndexError, KeyError):
             return None
 
 
             return None
 
 
-class Search(SolrIndex):
-    """
-    Search facilities.
-    """
-    def __init__(self, default_field="text"):
-        super(Search, self).__init__(mode='r')
+@total_ordering
+class PictureResult(object):
+    """A search result for one picture, built from index documents.
+
+    Results for the same picture can be combined with merge(); results are
+    ordered and compared by their boosted score.
+    """
+
+    def __init__(self, doc, how_found=None, query_terms=None):
+        """Build a result from a single index document.
+
+        doc - mapping with at least 'picture_id'; 'score', 'area_id',
+              'themes' and 'themes_pl' are read when present
+        how_found - label stored with the hit
+        query_terms - terms later matched against theme names in hits
+        """
+        self.boost = 1.0
+        self.query_terms = query_terms
+        self._picture = None
+        self._hits = []
+        self._processed_hits = None
 
 
+        if 'score' in doc:
+            self._score = doc['score']
+        else:
+            self._score = 0
 
 
-    def make_term_query(self, query, field='text', modal=operator.or_):
-        """
-        Returns term queries joined by boolean query.
-        modal - applies to boolean query
-        fuzzy - should the query by fuzzy.
-        """
-        if query is None: query = ''
-        q = self.index.Q()
-        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
-                        query.split(r" ")), q)
+        self.picture_id = int(doc["picture_id"])
 
 
-        return q
+        # Only documents that point at a picture area produce a hit.
+        if doc.get('area_id'):
+            hit = (self._score, {
+                'how_found': how_found,
+                'area_id': doc['area_id'],
+                'themes': doc.get('themes', []),
+                'themes_pl': doc.get('themes_pl', []),
+            })
+
+            self._hits.append(hit)
+
+    def __str__(self):
+        return "<PR id=%d score=%f >" % (self.picture_id, self._score)
 
 
-    def search_phrase(self, searched, field='text', book=False,
-                      filters=None,
-                      snippets=False):
-        if filters is None: filters = []
-        if book: filters.append(self.index.Q(is_book=True))
+    def __repr__(self):
+        return str(self)
 
 
-        q = self.index.query(**{field: searched})
-        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
-        res = q.execute()
-        return [SearchResult(found, how_found=u'search_phrase') for found in res]
+    @property
+    def score(self):
+        """Raw score multiplied by the current boost."""
+        return self._score * self.boost
 
 
-    def search_some(self, searched, fields, book=True,
-                    filters=None, snippets=True, query_terms=None):
-        assert isinstance(fields, list)
-        if filters is None: filters = []
-        if book: filters.append(self.index.Q(is_book=True))
+    def merge(self, other):
+        """Fold another result for the same picture into this one.
+
+        Appends the other result's hits and adds its score (negative
+        scores count as zero). Returns self. Raises ValueError when the
+        picture ids differ.
+        """
+        if self.picture_id != other.picture_id:
+            raise ValueError(
+                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
+        self._hits += other._hits
+        self._score += max(other._score, 0)
+        # The hit list changed, so a previously computed `hits` value is stale.
+        self._processed_hits = None
+        return self
 
 
-        query = self.index.Q()
+    # Positions inside the (score, details) tuples stored in self._hits.
+    SCORE = 0
+    OTHER = 1
 
 
-        for fld in fields:
-            query = self.index.Q(query | self.make_term_query(searched, fld))
+    @property
+    def hits(self):
+        """Return the best-scoring hit as a one-element list (cached).
+
+        Hits whose picture area no longer exists in the database are
+        skipped; the list is empty when no hit survives.
+        """
+        if self._processed_hits is not None:
+            return self._processed_hits
 
 
-        query = self.index.query(query)
-        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
-        res = query.execute()
-        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
+        hits = []
+        for hit in self._hits:
+            details = hit[self.OTHER]
+            try:
+                area = picture.models.PictureArea.objects.get(id=details['area_id'])
+            except picture.models.PictureArea.DoesNotExist:
+                # stale index
+                continue
+            # Figure out if we were searching for a token matching some word in theme name.
+            themes_hit = set()
+            if self.query_terms is not None:
+                themes_pl = details['themes_pl']
+                for i, theme in enumerate(details['themes']):
+                    # Tolerate a themes_pl list shorter than themes.
+                    theme_pl = themes_pl[i] if i < len(themes_pl) else ''
+                    # A set (not a one-shot map iterator) so that every query
+                    # term is tested against all words; str.split() splits on
+                    # runs of whitespace, which a literal ' +' separator never did.
+                    tms = {t.lower() for t in theme.split() + theme_pl.split(' ')}
+                    if any(qt in tms for qt in self.query_terms):
+                        themes_hit.add(theme)
 
 
+            m = {
+                'score': hit[self.SCORE],
+                'area': area,
+                'themes_hit': themes_hit,
+            }
+            m.update(details)
+            hits.append(m)
 
 
-    def search_everywhere(self, searched, query_terms=None):
-        """
-        Tries to use search terms to match different fields of book (or its parts).
-        E.g. one word can be an author survey, another be a part of the title, and the rest
-        are some words from third chapter.
-        """
-        books = []
-        # content only query : themes x content
-        q = self.make_term_query(searched, 'text')
-        q_themes = self.make_term_query(searched, 'themes_pl')
+        hits.sort(key=lambda h: h['score'], reverse=True)
+        # Only the single best hit is exposed.
+        hits = hits[:1]
+        self._processed_hits = hits
+        return hits
+
+    def get_picture(self):
+        """Load the Picture for picture_id on first use and cache it."""
+        if self._picture is None:
+            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
+        return self._picture
 
 
-        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
-        res = query.execute()
+    picture = property(get_picture)
+
+    @staticmethod
+    def aggregate(*result_lists):
+        """Merge results from several lists so each picture appears once.
+
+        The first result seen for a picture absorbs later ones via merge().
+        Returns the values view of the per-picture mapping.
+        """
+        by_picture = {}
+        for rl in result_lists:
+            for r in rl:
+                if r.picture_id in by_picture:
+                    by_picture[r.picture_id].merge(r)
+                else:
+                    by_picture[r.picture_id] = r
+        return by_picture.values()
 
 
-        for found in res:
-            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
+    def __lt__(self, other):
+        # Foreign types are not comparable; let Python raise TypeError.
+        if not isinstance(other, PictureResult):
+            return NotImplemented
+        return self.score < other.score
 
 
-        # query themes/content x author/title/tags
-        in_content = self.index.Q()
-        in_meta = self.index.Q()
+    def __eq__(self, other):
+        # Foreign types compare unequal instead of raising AttributeError.
+        if not isinstance(other, PictureResult):
+            return NotImplemented
+        return self.score == other.score
 
 
-        for fld in ['themes_pl', 'text']:
-            in_content |= self.make_term_query(searched, field=fld)
 
 
-        for fld in ['tags', 'authors', 'title']:
-            in_meta |= self.make_term_query(searched, field=fld)
+class Search(SolrIndex):
+    """
+    Search facilities.
+    """
+    def __init__(self, default_field="text"):
+        """Initialise the underlying Solr index with mode 'r'.
+
+        ``default_field`` is accepted but not used by this constructor.
+        """
+        super().__init__(mode='r')
 
 
-        q = in_content & in_meta
-        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
+    def make_term_query(self, query, field='text', modal=operator.or_):
+        """
+        Build one term query per space-separated token of `query` on `field`
+        and fold them together, left to right, starting from an empty Q.
+        query - search string; None is treated as ''
+        field - index field each token is matched against
+        modal - binary operator used to join the term queries
+        """
+        if query is None:
+            query = ''
+        q = self.index.Q()
+        for token in query.split(r" "):
+            q = modal(q, self.index.Q(**{field: token}))
 
 
-        for found in res:
-            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
+        return q
 
 
-        return books
+    def search_by_author(self, words):
+        """Find top-level, findable books whose cached author matches every word.
+
+        Queries the database rather than the Solr index, orders by
+        popularity and returns at most 30 results.
+        words - iterable of search words; each must match as a whole word
+        """
+        from catalogue.models import Book
+        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
+        for word in words:
+            # \m and \M look like PostgreSQL word-boundary anchors (TODO confirm
+            # the backend). A raw string avoids Python's invalid-escape warning,
+            # and escaping stops user input from being read as regex syntax.
+            pattern = r'\m%s\M' % re.escape(word)
+            books = books.filter(cached_author__iregex=pattern).select_related('popularity__count')
+        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
+
+    def search_words(self, words, fields, required=None, book=True, picture=False):
+        """Search the index for documents matching every word.
+
+        Each word must match in at least one of `fields`. When `required`
+        is given, at least one word must also match one of those fields.
+        A book-only search on exactly ['authors'] is delegated to
+        search_by_author().
+
+        words - search words
+        fields - index fields each word may match in
+        required - optional fields, one of which must match some word
+        book - restrict results to book documents
+        picture - search pictures instead of books (shadows the module-level
+                  `picture` import inside this method)
+
+        Returns a list of PictureResult (picture search) or SearchResult,
+        or an empty list when no usable filter can be built.
+        """
+        if book and not picture and fields == ['authors']:
+            return self.search_by_author(words)
+
+        # Stopwords are dropped only when searching neither books nor pictures.
+        if book or picture:
+            terms = list(words)
+        else:
+            terms = [word for word in words if word not in stopwords]
+
+        def any_of(queries):
+            # OR the queries together; None when there is nothing to combine.
+            queries = list(queries)
+            return reduce(operator.or_, queries) if queries else None
+
+        filters = [
+            any_of(self.index.Q(**{field: word}) for field in fields)
+            for word in terms]
+        if required:
+            filters.append(any_of(
+                self.index.Q(**{field: word})
+                for field in required for word in terms))
+        # Empty filters must not count: otherwise the guard below is bypassed
+        # and the query would match every document.
+        filters = [f for f in filters if f is not None]
+        if not filters:
+            return []
+
+        params = {}
+        if book:
+            params['is_book'] = True
+        if picture:
+            params['picture_id__gt'] = 0
+        else:
+            params['book_id__gt'] = 0
+        query = self.index.query(**params)
+        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
+        result_class = PictureResult if picture else SearchResult
+        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
 
     def get_snippets(self, searchresult, query, field='text', num=1):
         """
 
     def get_snippets(self, searchresult, query, field='text', num=1):
         """
@@ -822,120 +995,38 @@ class Search(SolrIndex):
                 text = snippets.get((int(position),
                                      int(length)))
                 snip = self.index.highlight(text=text, field=field, q=query)
                 text = snippets.get((int(position),
                                      int(length)))
                 snip = self.index.highlight(text=text, field=field, q=query)
-                snips[idx] = snip
-                if snip:
-                    num -= 1
+                if not snip and field == 'text':
+                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
+                if snip not in snips:
+                    snips[idx] = snip
+                    if snip:
+                        num -= 1
                 idx += 1
 
                 idx += 1
 
-        except IOError, e:
-            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
+        except IOError as e:
+            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
+            if not book:
+                log.error("Book does not exist for book id = %d" % book_id)
+            elif not book.get().children.exists():
+                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
             return []
         finally:
             snippets.close()
 
             return []
         finally:
             snippets.close()
 
-            # remove verse end markers..
-        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
+        # remove verse end markers..
+        snips = [s.replace("/\n", "\n") if s else s for s in snips]
 
         searchresult.snippets = snips
 
         return snips
 
 
         searchresult.snippets = snips
 
         return snips
 
-    def hint_tags(self, query, pdcounter=True, prefix=True):
-        """
-        Return auto-complete hints for tags
-        using prefix search.
-        """
-        q = self.index.Q()
-        query = query.strip()
-        for field in ['tag_name', 'tag_name_pl']:
-            if prefix:
-                q |= self.index.Q(**{field: query + "*"})
-            else:
-                q |= self.make_term_query(query, field=field)
-        qu = self.index.query(q)
-
-        return self.search_tags(qu, pdcounter=pdcounter)
-
-    def search_tags(self, query, filters=None, pdcounter=False):
-        """
-        Search for Tag objects using query.
-        """
-        if not filters: filters = []
-        if not pdcounter:
-            filters.append(~self.index.Q(is_pdcounter=True))
-        res = self.apply_filters(query, filters).execute()
-
-        tags = []
-        pd_tags = []
-
-        for doc in res:
-            is_pdcounter = doc.get('is_pdcounter', False)
-            category = doc.get('tag_category')
-            try:
-                if is_pdcounter == True:
-                    if category == 'pd_author':
-                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
-                    elif category == 'pd_book':
-                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
-                        tag.category = 'pd_book'  # make it look more lik a tag.
-                    else:
-                        print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)).encode('utf-8')
-                    pd_tags.append(tag)
-                else:
-                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
-                    tags.append(tag)
-
-            except catalogue.models.Tag.DoesNotExist: pass
-            except PDCounterAuthor.DoesNotExist: pass
-            except PDCounterBook.DoesNotExist: pass
-
-        tags_slugs = set(map(lambda t: t.slug, tags))
-        tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
-
-        log.debug('search_tags: %s' % tags)
-
-        return tags
-
-    def hint_books(self, query, prefix=True):
-        """
-        Returns auto-complete hints for book titles
-        Because we do not index 'pseudo' title-tags.
-        Prefix search.
-        """
-        q = self.index.Q()
-        query = query.strip()
-        if prefix:
-            q |= self.index.Q(title=query + "*")
-        else:
-            q |= self.make_term_query(query, field='title')
-        qu = self.index.query(q)
-        only_books = self.index.Q(is_book=True)
-        return self.search_books(qu, [only_books])
-
-    def search_books(self, query, filters=None, max_results=10):
-        """
-        Searches for Book objects using query
-        """
-        bks = []
-        bks_found = set()
-        query = query.query(is_book=True)
-        res = self.apply_filters(query, filters).field_limit(['book_id'])
-        for r in res:
-            try:
-                bid = r['book_id']
-                if not bid in bks_found:
-                    bks.append(catalogue.models.Book.objects.get(id=bid))
-                    bks_found.add(bid)
-            except catalogue.models.Book.DoesNotExist: pass
-        return bks
-
     @staticmethod
     def apply_filters(query, filters):
         """
         Apply filters to a query
         """
     @staticmethod
     def apply_filters(query, filters):
         """
         Apply filters to a query
         """
-        if filters is None: filters = []
+        if filters is None:
+            filters = []
         filters = filter(lambda x: x is not None, filters)
         for f in filters:
             query = query.query(f)
         filters = filter(lambda x: x is not None, filters)
         for f in filters:
             query = query.query(f)