Searching, filtering fixes.
[wolnelektury.git] / src / search / index.py
index a1c2716..22c9a02 100644 (file)
@@ -1,29 +1,32 @@
-# -*- coding: utf-8 -*-
 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
-from django.conf import settings
-
+from functools import reduce, total_ordering
+from itertools import chain
+import logging
+import operator
 import os
 import re
 import os
 import re
+from django.conf import settings
 from librarian import dcparser
 from librarian import dcparser
+import librarian.meta.types.date
+import librarian.meta.types.person
+import librarian.meta.types.text
 from librarian.parser import WLDocument
 from lxml import etree
 from librarian.parser import WLDocument
 from lxml import etree
+import scorched
 import catalogue.models
 import picture.models
 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
 import catalogue.models
 import picture.models
 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
-from itertools import chain
-import sunburnt
-import custom
-import operator
-import logging
 from wolnelektury.utils import makedirs
 from wolnelektury.utils import makedirs
+from . import custom
 
 log = logging.getLogger('search')
 
 
 log = logging.getLogger('search')
 
+
 if os.path.isfile(settings.SOLR_STOPWORDS):
     stopwords = set(
 if os.path.isfile(settings.SOLR_STOPWORDS):
     stopwords = set(
-        line.decode('utf-8').strip()
+        line.strip()
         for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
 else:
     stopwords = set()
         for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
 else:
     stopwords = set()
@@ -95,7 +98,10 @@ class Snippets(object):
         of the snippet stored there.
         """
         self.file.seek(pos[0], 0)
         of the snippet stored there.
         """
         self.file.seek(pos[0], 0)
-        txt = self.file.read(pos[1]).decode('utf-8')
+        try:
+            txt = self.file.read(pos[1]).decode('utf-8')
+        except:
+            return ''
         return txt
 
     def close(self):
         return txt
 
     def close(self):
@@ -122,6 +128,22 @@ class Index(SolrIndex):
     def __init__(self):
         super(Index, self).__init__(mode='rw')
 
     def __init__(self):
         super(Index, self).__init__(mode='rw')
 
+    def remove_snippets(self, book):
+        """Delete every entry of ``book.snippet_set``."""
+        stored_snippets = book.snippet_set.all()
+        stored_snippets.delete()
+
+    def add_snippet(self, book, doc):
+        """Store one indexed part of ``book`` as an entry in ``book.snippet_set``.
+
+        ``doc`` is modified in place: 'book_id' is always popped, and for
+        non-fragment parts 'text' and 'header_index' are popped as well.
+        Parts carrying a 'fragment_anchor' key are not stored here.
+        """
+        # Pop outside the assert: assert statements are dropped under
+        # ``python -O``, and the pop has to happen in that mode too.
+        doc_book_id = doc.pop('book_id')
+        assert book.id == doc_book_id
+        # Fragments already exist and can be indexed where they live.
+        if 'fragment_anchor' in doc:
+            return
+
+        text = doc.pop('text')
+        header_index = doc.pop('header_index')
+        book.snippet_set.create(
+            sec=header_index,
+            text=text,
+        )
+
     def delete_query(self, *queries):
         """
         index.delete(queries=...) doesn't work, so let's reimplement it
     def delete_query(self, *queries):
         """
         index.delete(queries=...) doesn't work, so let's reimplement it
@@ -129,7 +151,7 @@ class Index(SolrIndex):
         """
         uids = set()
         for q in queries:
         """
         uids = set()
         for q in queries:
-            if isinstance(q, sunburnt.search.LuceneQuery):
+            if isinstance(q, scorched.search.LuceneQuery):
                 q = self.index.query(q)
             q.field_limiter.update(['uid'])
             st = 0
                 q = self.index.query(q)
             q.field_limiter.update(['uid'])
             st = 0
@@ -142,7 +164,8 @@ class Index(SolrIndex):
                     uids.add(res['uid'])
                 st += rows
         if uids:
                     uids.add(res['uid'])
                 st += rows
         if uids:
-            self.index.delete(uids)
+            # FIXME: With Solr API change, this doesn't work.
+            #self.index.delete(uids)
             return True
         else:
             return False
             return True
         else:
             return False
@@ -222,30 +245,29 @@ class Index(SolrIndex):
             doc['parent_id'] = int(book.parent.id)
         return doc
 
             doc['parent_id'] = int(book.parent.id)
         return doc
 
-    def remove_book(self, book_or_id, remove_snippets=True):
+    def remove_book(self, book, remove_snippets=True, legacy=True):
         """Removes a book from search index.
         book - Book instance."""
         """Removes a book from search index.
         book - Book instance."""
-        if isinstance(book_or_id, catalogue.models.Book):
-            book_id = book_or_id.id
-        else:
-            book_id = book_or_id
+        if legacy:
+          self.delete_query(self.index.Q(book_id=book.id))
 
 
-        self.delete_query(self.index.Q(book_id=book_id))
-
-        if remove_snippets:
-            snippets = Snippets(book_id)
+          if remove_snippets:
+            snippets = Snippets(book.id)
             snippets.remove()
             snippets.remove()
+        self.remove_snippets(book)
 
 
-    def index_book(self, book, book_info=None, overwrite=True):
+    def index_book(self, book, book_info=None, overwrite=True, legacy=True):
         """
         Indexes the book.
         Creates a lucene document for extracted metadata
         and calls self.index_content() to index the contents of the book.
         """
         """
         Indexes the book.
         Creates a lucene document for extracted metadata
         and calls self.index_content() to index the contents of the book.
         """
+        if not book.xml_file: return
+
         if overwrite:
             # we don't remove snippets, since they might be still needed by
             # threads using not reopened index
         if overwrite:
             # we don't remove snippets, since they might be still needed by
             # threads using not reopened index
-            self.remove_book(book, remove_snippets=False)
+            self.remove_book(book, remove_snippets=False, legacy=legacy)
 
         book_doc = self.create_book_doc(book)
         meta_fields = self.extract_metadata(book, book_info, dc_only=[
 
         book_doc = self.create_book_doc(book)
         meta_fields = self.extract_metadata(book, book_info, dc_only=[
@@ -258,7 +280,8 @@ class Index(SolrIndex):
             book_doc[n] = f
 
         book_doc['uid'] = "book%s" % book_doc['book_id']
             book_doc[n] = f
 
         book_doc['uid'] = "book%s" % book_doc['book_id']
-        self.index.add(book_doc)
+        if legacy:
+            self.index.add(book_doc)
         del book_doc
         book_fields = {
             'title': meta_fields['title'],
         del book_doc
         book_fields = {
             'title': meta_fields['title'],
@@ -270,7 +293,7 @@ class Index(SolrIndex):
             if tag_name in meta_fields:
                 book_fields[tag_name] = meta_fields[tag_name]
 
             if tag_name in meta_fields:
                 book_fields[tag_name] = meta_fields[tag_name]
 
-        self.index_content(book, book_fields=book_fields)
+        self.index_content(book, book_fields=book_fields, legacy=legacy)
 
     master_tags = [
         'opowiadanie',
 
     master_tags = [
         'opowiadanie',
@@ -302,7 +325,7 @@ class Index(SolrIndex):
         fields = {}
 
         if book_info is None:
         fields = {}
 
         if book_info is None:
-            book_info = dcparser.parse(open(book.xml_file.path))
+            book_info = dcparser.parse(open(book.xml_file.path, 'rb'))
 
         fields['slug'] = book.slug
         fields['is_book'] = True
 
         fields['slug'] = book.slug
         fields['is_book'] = True
@@ -314,21 +337,20 @@ class Index(SolrIndex):
             if hasattr(book_info, field.name):
                 if not getattr(book_info, field.name):
                     continue
             if hasattr(book_info, field.name):
                 if not getattr(book_info, field.name):
                     continue
-                # since no type information is available, we use validator
-                type_indicator = field.validator
-                if type_indicator == dcparser.as_unicode:
+                type_indicator = field.value_type
+                if issubclass(type_indicator, librarian.meta.types.text.TextValue):
                     s = getattr(book_info, field.name)
                     if field.multiple:
                         s = ', '.join(s)
                     fields[field.name] = s
                     s = getattr(book_info, field.name)
                     if field.multiple:
                         s = ', '.join(s)
                     fields[field.name] = s
-                elif type_indicator == dcparser.as_person:
+                elif issubclass(type_indicator, librarian.meta.types.person.Person):
                     p = getattr(book_info, field.name)
                     p = getattr(book_info, field.name)
-                    if isinstance(p, dcparser.Person):
-                        persons = unicode(p)
+                    if isinstance(p, librarian.meta.types.person.Person):
+                        persons = str(p)
                     else:
                     else:
-                        persons = ', '.join(map(unicode, p))
+                        persons = ', '.join(map(str, p))
                     fields[field.name] = persons
                     fields[field.name] = persons
-                elif type_indicator == dcparser.as_date:
+                elif issubclass(type_indicator, librarian.meta.types.date.DateValue):
                     dt = getattr(book_info, field.name)
                     fields[field.name] = dt
 
                     dt = getattr(book_info, field.name)
                     fields[field.name] = dt
 
@@ -362,7 +384,7 @@ class Index(SolrIndex):
             if master.tag in self.master_tags:
                 return master
 
             if master.tag in self.master_tags:
                 return master
 
-    def index_content(self, book, book_fields):
+    def index_content(self, book, book_fields, legacy=True):
         """
         Walks the book XML and extract content from it.
         Adds parts for each header tag and for each fragment.
         """
         Walks the book XML and extract content from it.
         Adds parts for each header tag and for each fragment.
@@ -389,16 +411,16 @@ class Index(SolrIndex):
             return
 
         def fix_format(text):
             return
 
         def fix_format(text):
-            # separator = [u" ", u"\t", u".", u";", u","]
+            # separator = [" ", "\t", ".", ";", ","]
             if isinstance(text, list):
                 # need to join it first
                 text = filter(lambda s: s is not None, content)
             if isinstance(text, list):
                 # need to join it first
                 text = filter(lambda s: s is not None, content)
-                text = u' '.join(text)
+                text = ' '.join(text)
                 # for i in range(len(text)):
                 #     if i > 0:
                 #         if text[i][0] not in separator\
                 #             and text[i - 1][-1] not in separator:
                 # for i in range(len(text)):
                 #     if i > 0:
                 #         if text[i][0] not in separator\
                 #             and text[i - 1][-1] not in separator:
-                #          text.insert(i, u" ")
+                #          text.insert(i, " ")
 
             return re.sub("(?m)/$", "", text)
 
 
             return re.sub("(?m)/$", "", text)
 
@@ -462,9 +484,10 @@ class Index(SolrIndex):
                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                         handle_text.pop()
                         doc = add_part(snippets, header_index=position, header_type=header.tag,
                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                         handle_text.pop()
                         doc = add_part(snippets, header_index=position, header_type=header.tag,
-                                       text=u''.join(footnote),
-                                       is_footnote=True)
-                        self.index.add(doc)
+                                       text=''.join(footnote))
+                        self.add_snippet(book, doc)
+                        if legacy:
+                            self.index.add(doc)
                         footnote = []
 
                     # handle fragments and themes.
                         footnote = []
 
                     # handle fragments and themes.
@@ -478,7 +501,7 @@ class Index(SolrIndex):
                         fid = start.attrib['id'][1:]
                         handle_text.append(lambda text: None)
                         if start.text is not None:
                         fid = start.attrib['id'][1:]
                         handle_text.append(lambda text: None)
                         if start.text is not None:
-                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
+                            fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
                     elif end is not None and end.tag == 'motyw':
                         handle_text.pop()
 
                     elif end is not None and end.tag == 'motyw':
                         handle_text.pop()
 
@@ -498,7 +521,10 @@ class Index(SolrIndex):
                                        fragment_anchor=fid,
                                        text=fix_format(frag['text']),
                                        themes=frag['themes'])
                                        fragment_anchor=fid,
                                        text=fix_format(frag['text']),
                                        themes=frag['themes'])
-                        self.index.add(doc)
+                        # Add searchable fragment
+                        self.add_snippet(book, doc)
+                        if legacy:
+                            self.index.add(doc)
 
                         # Collect content.
 
 
                         # Collect content.
 
@@ -510,7 +536,9 @@ class Index(SolrIndex):
                 doc = add_part(snippets, header_index=position,
                                header_type=header.tag, text=fix_format(content))
 
                 doc = add_part(snippets, header_index=position,
                                header_type=header.tag, text=fix_format(content))
 
-                self.index.add(doc)
+                self.add_snippet(book, doc)
+                if legacy:
+                    self.index.add(doc)
 
         finally:
             snippets.close()
 
         finally:
             snippets.close()
@@ -557,6 +585,7 @@ class Index(SolrIndex):
         self.index.add(doc)
 
 
         self.index.add(doc)
 
 
+@total_ordering
 class SearchResult(object):
     def __init__(self, doc, how_found=None, query_terms=None):
         self.boost = 1.0
 class SearchResult(object):
     def __init__(self, doc, how_found=None, query_terms=None):
         self.boost = 1.0
@@ -610,14 +639,14 @@ class SearchResult(object):
         result._book = book
         return result
 
         result._book = book
         return result
 
-    def __unicode__(self):
-        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
+    def __str__(self):
+        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
             (self.book_id, len(self._hits),
              len(self._processed_hits) if self._processed_hits else -1,
              self._score, len(self.snippets))
 
             (self.book_id, len(self._hits),
              len(self._processed_hits) if self._processed_hits else -1,
              self._score, len(self.snippets))
 
-    def __str__(self):
-        return unicode(self).encode('utf-8')
+    def __bytes__(self):
+        return str(self).encode('utf-8')
 
     @property
     def score(self):
 
     @property
     def score(self):
@@ -633,7 +662,10 @@ class SearchResult(object):
     def get_book(self):
         if self._book is not None:
             return self._book
     def get_book(self):
         if self._book is not None:
             return self._book
-        self._book = catalogue.models.Book.objects.get(id=self.book_id)
+        try:
+            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
+        except catalogue.models.Book.DoesNotExist:
+            self._book = None
         return self._book
 
     book = property(get_book)
         return self._book
 
     book = property(get_book)
@@ -653,27 +685,25 @@ class SearchResult(object):
         # to sections and fragments
         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 
         # to sections and fragments
         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 
-        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
+        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
 
         # sections not covered by fragments
 
         # sections not covered by fragments
-        sect = filter(lambda s: 0 == len(filter(
+        sect = filter(lambda s: 0 == len(list(filter(
             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
-                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
+                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
 
 
-        def remove_duplicates(lst, keyfn, compare):
+        def remove_duplicates(lst, keyfn, larger):
             els = {}
             for e in lst:
                 eif = keyfn(e)
                 if eif in els:
             els = {}
             for e in lst:
                 eif = keyfn(e)
                 if eif in els:
-                    if compare(els[eif], e) >= 1:
+                    if larger(els[eif], e):
                         continue
                 els[eif] = e
             return els.values()
 
         # remove fragments with duplicated fid's and duplicated snippets
                         continue
                 els[eif] = e
             return els.values()
 
         # remove fragments with duplicated fid's and duplicated snippets
-        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
-        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
-        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
+        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
 
         # remove duplicate sections
         sections = {}
 
         # remove duplicate sections
         sections = {}
@@ -691,7 +721,7 @@ class SearchResult(object):
             m.update(s[self.OTHER])
             sections[si] = m
 
             m.update(s[self.OTHER])
             sections[si] = m
 
-        hits = sections.values()
+        hits = list(sections.values())
 
         for f in frags:
             try:
 
         for f in frags:
             try:
@@ -705,19 +735,19 @@ class SearchResult(object):
             if self.query_terms is not None:
                 for i in range(0, len(f[self.OTHER]['themes'])):
                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
             if self.query_terms is not None:
                 for i in range(0, len(f[self.OTHER]['themes'])):
                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
-                    tms = map(unicode.lower, tms)
+                    tms = map(str.lower, tms)
                     for qt in self.query_terms:
                         if qt in tms:
                             themes_hit.add(f[self.OTHER]['themes'][i])
                             break
 
             def theme_by_name(n):
                     for qt in self.query_terms:
                         if qt in tms:
                             themes_hit.add(f[self.OTHER]['themes'][i])
                             break
 
             def theme_by_name(n):
-                th = filter(lambda t: t.name == n, themes)
+                th = list(filter(lambda t: t.name == n, themes))
                 if th:
                     return th[0]
                 else:
                     return None
                 if th:
                     return th[0]
                 else:
                     return None
-            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
+            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
 
             m = {'score': f[self.SCORE],
                  'fragment': frag,
 
             m = {'score': f[self.SCORE],
                  'fragment': frag,
@@ -745,13 +775,17 @@ class SearchResult(object):
                     books[r.book_id] = r
         return books.values()
 
                     books[r.book_id] = r
         return books.values()
 
-    def __cmp__(self, other):
-        c = cmp(self.score, other.score)
-        if c == 0:
-            # this is inverted, because earlier date is better
-            return cmp(other.published_date, self.published_date)
-        else:
-            return c
+    def get_sort_key(self):
+        """Return the tuple this result is compared by.
+
+        The tuple is (-score, published_date, author sort key, book sort
+        key); the last two are '' when ``self.book`` is falsy.
+        """
+        # Read the property once: when no book was found, get_book() keeps
+        # None cached and every further access repeats the database lookup.
+        book = self.book
+        return (-self.score,
+                self.published_date,
+                book.sort_key_author if book else '',
+                book.sort_key if book else '')
+
+    def __lt__(self, other):
+        """Compare results by sort key, in reverse.
+
+        ``self < other`` holds when the sort key of ``self`` is the greater
+        one. Returns NotImplemented for anything that is not a SearchResult,
+        so Python raises TypeError instead of an AttributeError from here.
+        """
+        if not isinstance(other, SearchResult):
+            return NotImplemented
+        return self.get_sort_key() > other.get_sort_key()
+
+    def __eq__(self, other):
+        """Two results are equal when their sort keys are equal.
+
+        Returns NotImplemented for anything that is not a SearchResult, so
+        a comparison such as ``result == None`` is False instead of raising.
+        """
+        # NOTE(review): unless this class also defines __hash__, defining
+        # __eq__ makes its instances unhashable - confirm nothing relies on
+        # putting results in a set or using them as dict keys.
+        if not isinstance(other, SearchResult):
+            return NotImplemented
+        return self.get_sort_key() == other.get_sort_key()
 
     def __len__(self):
         return len(self.hits)
 
     def __len__(self):
         return len(self.hits)
@@ -766,6 +800,7 @@ class SearchResult(object):
             return None
 
 
             return None
 
 
+@total_ordering
 class PictureResult(object):
     def __init__(self, doc, how_found=None, query_terms=None):
         self.boost = 1.0
 class PictureResult(object):
     def __init__(self, doc, how_found=None, query_terms=None):
         self.boost = 1.0
@@ -791,11 +826,11 @@ class PictureResult(object):
 
             self._hits.append(hit)
 
 
             self._hits.append(hit)
 
-    def __unicode__(self):
-        return u"<PR id=%d score=%f >" % (self.picture_id, self._score)
+    def __str__(self):
+        return "<PR id=%d score=%f >" % (self.picture_id, self._score)
 
     def __repr__(self):
 
     def __repr__(self):
-        return unicode(self)
+        return str(self)
 
     @property
     def score(self):
 
     @property
     def score(self):
@@ -829,7 +864,7 @@ class PictureResult(object):
             if self.query_terms is not None:
                 for i in range(0, len(hit[self.OTHER]['themes'])):
                     tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
             if self.query_terms is not None:
                 for i in range(0, len(hit[self.OTHER]['themes'])):
                     tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
-                    tms = map(unicode.lower, tms)
+                    tms = map(str.lower, tms)
                     for qt in self.query_terms:
                         if qt in tms:
                             themes_hit.add(hit[self.OTHER]['themes'][i])
                     for qt in self.query_terms:
                         if qt in tms:
                             themes_hit.add(hit[self.OTHER]['themes'][i])
@@ -866,8 +901,11 @@ class PictureResult(object):
                     books[r.picture_id] = r
         return books.values()
 
                     books[r.picture_id] = r
         return books.values()
 
-    def __cmp__(self, other):
-        return cmp(self.score, other.score)
+    def __lt__(self, other):
+        """Order picture results by ascending score.
+
+        Returns NotImplemented for anything that is not a PictureResult, so
+        Python raises TypeError instead of an AttributeError from here.
+        """
+        if not isinstance(other, PictureResult):
+            return NotImplemented
+        return self.score < other.score
+
+    def __eq__(self, other):
+        """Two picture results are equal when their scores are equal.
+
+        Returns NotImplemented for anything that is not a PictureResult, so
+        a comparison such as ``result == None`` is False instead of raising.
+        """
+        if not isinstance(other, PictureResult):
+            return NotImplemented
+        return self.score == other.score
 
 
 class Search(SolrIndex):
 
 
 class Search(SolrIndex):
@@ -892,7 +930,7 @@ class Search(SolrIndex):
 
     def search_by_author(self, words):
         from catalogue.models import Book
 
     def search_by_author(self, words):
         from catalogue.models import Book
-        books = Book.objects.filter(parent=None).order_by('-popularity__count')
+        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
         for word in words:
             books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
         return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
         for word in words:
             books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
         return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
@@ -957,14 +995,16 @@ class Search(SolrIndex):
                 text = snippets.get((int(position),
                                      int(length)))
                 snip = self.index.highlight(text=text, field=field, q=query)
                 text = snippets.get((int(position),
                                      int(length)))
                 snip = self.index.highlight(text=text, field=field, q=query)
+                if not snip and field == 'text':
+                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                 if snip not in snips:
                     snips[idx] = snip
                     if snip:
                         num -= 1
                 idx += 1
 
                 if snip not in snips:
                     snips[idx] = snip
                     if snip:
                         num -= 1
                 idx += 1
 
-        except IOError, e:
-            book = catalogue.models.Book.objects.filter(id=book_id)
+        except IOError as e:
+            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
             if not book:
                 log.error("Book does not exist for book id = %d" % book_id)
             elif not book.get().children.exists():
             if not book:
                 log.error("Book does not exist for book id = %d" % book_id)
             elif not book.get().children.exists():
@@ -973,8 +1013,8 @@ class Search(SolrIndex):
         finally:
             snippets.close()
 
         finally:
             snippets.close()
 
-            # remove verse end markers..
-        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
+        # remove verse end markers..
+        snips = [s.replace("/\n", "\n") if s else s for s in snips]
 
         searchresult.snippets = snips
 
 
         searchresult.snippets = snips