GzipPipelineCachedStorage from fnpdjango.
[wolnelektury.git] / apps / search / index.py
index 32cf5f9..7fb60b5 100644 (file)
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
 from django.conf import settings
 
 import os
 from django.conf import settings
 
 import os
@@ -18,6 +20,7 @@ import sunburnt
 import custom
 import operator
 
 import custom
 import operator
 
+log = logging.getLogger('search')
 
 class SolrIndex(object):
     def __init__(self, mode=None):
 
 class SolrIndex(object):
     def __init__(self, mode=None):
@@ -92,7 +95,8 @@ class Snippets(object):
 
     def close(self):
         """Close snippet file"""
 
     def close(self):
         """Close snippet file"""
-        self.file.close()
+        if self.file:
+            self.file.close()
 
     def remove(self):
         self.revision = None
 
     def remove(self):
         self.revision = None
@@ -111,7 +115,7 @@ class Index(SolrIndex):
     Class indexing books.
     """
     def __init__(self):
     Class indexing books.
     """
     def __init__(self):
-        super(Index, self).__init__()
+        super(Index, self).__init__(mode='rw')
 
     def delete_query(self, *queries):
         """
 
     def delete_query(self, *queries):
         """
@@ -132,7 +136,6 @@ class Index(SolrIndex):
                 for res in ids:
                     uids.add(res['uid'])
                 st += rows
                 for res in ids:
                     uids.add(res['uid'])
                 st += rows
-                #        print "Will delete %s" % ','.join([x for x in uids])
         if uids:
             self.index.delete(uids)
             return True
         if uids:
             self.index.delete(uids)
             return True
@@ -145,6 +148,7 @@ class Index(SolrIndex):
         Removes all tags from index, then index them again.
         Indexed fields include: id, name (with and without polish stems), category
         """
         Removes all tags from index, then index them again.
         Indexed fields include: id, name (with and without polish stems), category
         """
+        log.debug("Indexing tags")
         remove_only = kw.get('remove_only', False)
         # first, remove tags from index.
         if tags:
         remove_only = kw.get('remove_only', False)
         # first, remove tags from index.
         if tags:
@@ -161,7 +165,7 @@ class Index(SolrIndex):
 
                 q_id_cat = self.index.Q(q_id & q_cat)
                 tag_qs.append(q_id_cat)
 
                 q_id_cat = self.index.Q(q_id & q_cat)
                 tag_qs.append(q_id_cat)
-            self.delete_query(tag_qs)
+            self.delete_query(*tag_qs)
         else:  # all
             q = self.index.Q(tag_id__any=True)
             self.delete_query(q)
         else:  # all
             q = self.index.Q(tag_id__any=True)
             self.delete_query(q)
@@ -202,7 +206,6 @@ class Index(SolrIndex):
                         "uid": "tag%d" % tag.id
                         }
                 self.index.add(doc)
                         "uid": "tag%d" % tag.id
                         }
                 self.index.add(doc)
-                print "%s %s" % (doc['tag_name'], doc['tag_category'])
 
     def create_book_doc(self, book):
         """
 
     def create_book_doc(self, book):
         """
@@ -241,7 +244,7 @@ class Index(SolrIndex):
             self.remove_book(book, remove_snippets=False)
 
         book_doc = self.create_book_doc(book)
             self.remove_book(book, remove_snippets=False)
 
         book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
         # let's not index it - it's only used for extracting publish date
         if 'source_name' in meta_fields:
             del meta_fields['source_name']
         # let's not index it - it's only used for extracting publish date
         if 'source_name' in meta_fields:
             del meta_fields['source_name']
@@ -257,6 +260,7 @@ class Index(SolrIndex):
             'authors': meta_fields['authors'],
             'published_date': meta_fields['published_date']
             }
             'authors': meta_fields['authors'],
             'published_date': meta_fields['published_date']
             }
+
         if 'translators' in meta_fields:
             book_fields['translators'] = meta_fields['translators']
 
         if 'translators' in meta_fields:
             book_fields['translators'] = meta_fields['translators']
 
@@ -461,9 +465,7 @@ class Index(SolrIndex):
                         doc = add_part(snippets, header_index=position, header_type=header.tag,
                                        text=u''.join(footnote),
                                        is_footnote=True)
                         doc = add_part(snippets, header_index=position, header_type=header.tag,
                                        text=u''.join(footnote),
                                        is_footnote=True)
-
                         self.index.add(doc)
                         self.index.add(doc)
-                        #print "@ footnote text: %s" % footnote
                         footnote = []
 
                     # handle fragments and themes.
                         footnote = []
 
                     # handle fragments and themes.
@@ -496,7 +498,6 @@ class Index(SolrIndex):
                                        fragment_anchor=fid,
                                        text=fix_format(frag['text']),
                                        themes=frag['themes'])
                                        fragment_anchor=fid,
                                        text=fix_format(frag['text']),
                                        themes=frag['themes'])
-                        #print '@ FRAG %s' % frag['content']
                         self.index.add(doc)
 
                         # Collect content.
                         self.index.add(doc)
 
                         # Collect content.
@@ -509,7 +510,6 @@ class Index(SolrIndex):
                         # in the end, add a section text.
                 doc = add_part(snippets, header_index=position,
                                header_type=header.tag, text=fix_format(content))
                         # in the end, add a section text.
                 doc = add_part(snippets, header_index=position,
                                header_type=header.tag, text=fix_format(content))
-                #print '@ CONTENT: %s' % fix_format(content)
 
                 self.index.add(doc)
 
 
                 self.index.add(doc)
 
@@ -518,12 +518,13 @@ class Index(SolrIndex):
 
 
 class SearchResult(object):
 
 
 class SearchResult(object):
-    def __init__(self, doc, how_found=None, query=None):
+    def __init__(self, doc, how_found=None, query=None, query_terms=None):
         #        self.search = search
         self.boost = 1.0
         self._hits = []
         self._processed_hits = None  # processed hits
         self.snippets = []
         #        self.search = search
         self.boost = 1.0
         self._hits = []
         self._processed_hits = None  # processed hits
         self.snippets = []
+        self.query_terms = query_terms
 
         if 'score' in doc:
             self._score = doc['score']
 
         if 'score' in doc:
             self._score = doc['score']
@@ -546,20 +547,22 @@ class SearchResult(object):
             header_span = header_span is not None and int(header_span) or 1
             fragment = doc.get("fragment_anchor", None)
             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
             header_span = header_span is not None and int(header_span) or 1
             fragment = doc.get("fragment_anchor", None)
             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
-            snippets_rev = doc['snippets_revision']
+            snippets_rev = doc.get('snippets_revision', None)
 
             hit = (sec + (header_span,), fragment, self._score, {
                 'how_found': how_found,
                 'snippets_pos': snippets_pos,
 
             hit = (sec + (header_span,), fragment, self._score, {
                 'how_found': how_found,
                 'snippets_pos': snippets_pos,
-                'snippets_revision': snippets_rev
+                'snippets_revision': snippets_rev,
+                'themes': doc.get('themes', []),
+                'themes_pl': doc.get('themes_pl', [])
                 })
 
             self._hits.append(hit)
 
     def __unicode__(self):
                 })
 
             self._hits.append(hit)
 
     def __unicode__(self):
-        return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
+        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
             (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
             (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
-    
+
     def __str__(self):
         return unicode(self).encode('utf-8')
 
     def __str__(self):
         return unicode(self).encode('utf-8')
 
@@ -647,19 +650,25 @@ class SearchResult(object):
             except catalogue.models.Fragment.DoesNotExist:
                 # stale index
                 continue
             except catalogue.models.Fragment.DoesNotExist:
                 # stale index
                 continue
-
             # Figure out if we were searching for a token matching some word in theme name.
             themes = frag.tags.filter(category='theme')
             # Figure out if we were searching for a token matching some word in theme name.
             themes = frag.tags.filter(category='theme')
-            themes_hit = []
-            # if self.searched is not None:
-            #     tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
-            #     for theme in themes:
-            #         name_tokens = self.search.get_tokens(theme.name, 'POLISH')
-            #         for t in tokens:
-            #             if t in name_tokens:
-            #                 if not theme in themes_hit:
-            #                     themes_hit.append(theme)
-            #                 break
+            themes_hit = set()
+            if self.query_terms is not None:
+                for i in range(0, len(f[self.OTHER]['themes'])):
+                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
+                    tms = map(unicode.lower, tms)
+                    for qt in self.query_terms:
+                        if qt in tms:
+                            themes_hit.add(f[self.OTHER]['themes'][i])
+                            break
+
+            def theme_by_name(n):
+                th = filter(lambda t: t.name == n, themes)
+                if th:
+                    return th[0]
+                else:
+                    return None
+            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
 
             m = {'score': f[self.SCORE],
                  'fragment': frag,
 
             m = {'score': f[self.SCORE],
                  'fragment': frag,
@@ -713,70 +722,8 @@ class Search(SolrIndex):
     Search facilities.
     """
     def __init__(self, default_field="text"):
     Search facilities.
     """
     def __init__(self, default_field="text"):
-        super(Search, self).__init__()
+        super(Search, self).__init__(mode='r')
 
 
-    # def get_tokens(self, searched, field='text', cached=None):
-    #     """returns tokens analyzed by a proper (for a field) analyzer
-    #     argument can be: StringReader, string/unicode, or tokens. In the last case
-    #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
-    #     """
-    #     if cached is not None and field in cached:
-    #         return cached[field]
-
-    #     if isinstance(searched, str) or isinstance(searched, unicode):
-    #         searched = StringReader(searched)
-    #     elif isinstance(searched, list):
-    #         return searched
-
-    #     searched.reset()
-    #     tokens = self.analyzer.reusableTokenStream(field, searched)
-    #     toks = []
-    #     while tokens.incrementToken():
-    #         cta = tokens.getAttribute(CharTermAttribute.class_)
-    #         toks.append(cta.toString())
-
-    #     if cached is not None:
-    #         cached[field] = toks
-
-    #     return toks
-
-    # @staticmethod
-    # def fuzziness(fuzzy):
-    #     """Helper method to sanitize fuzziness"""
-    #     if not fuzzy:
-    #         return None
-    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
-    #         return fuzzy
-    #     else:
-    #         return 0.5
-
-    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
-    #     """
-    #     Return a PhraseQuery with a series of tokens.
-    #     """
-    #     if fuzzy:
-    #         phrase = MultiPhraseQuery()
-    #         for t in tokens:
-    #             term = Term(field, t)
-    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
-    #             fuzzterms = []
-
-    #             while True:
-    #                 ft = fuzzterm.term()
-    #                 if ft:
-    #                     fuzzterms.append(ft)
-    #                 if not fuzzterm.next(): break
-    #             if fuzzterms:
-    #                 phrase.add(JArray('object')(fuzzterms, Term))
-    #             else:
-    #                 phrase.add(term)
-    #     else:
-    #         phrase = PhraseQuery()
-    #         phrase.setSlop(slop)
-    #         for t in tokens:
-    #             term = Term(field, t)
-    #             phrase.add(term)
-    #     return phrase
 
     def make_term_query(self, query, field='text', modal=operator.or_):
         """
 
     def make_term_query(self, query, field='text', modal=operator.or_):
         """
@@ -784,6 +731,7 @@ class Search(SolrIndex):
         modal - applies to boolean query
         fuzzy - should the query by fuzzy.
         """
         modal - applies to boolean query
         fuzzy - should the query by fuzzy.
         """
+        if query is None: query = ''
         q = self.index.Q()
         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                         query.split(r" ")), q)
         q = self.index.Q()
         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                         query.split(r" ")), q)
@@ -802,8 +750,7 @@ class Search(SolrIndex):
         return [SearchResult(found, how_found=u'search_phrase') for found in res]
 
     def search_some(self, searched, fields, book=True,
         return [SearchResult(found, how_found=u'search_phrase') for found in res]
 
     def search_some(self, searched, fields, book=True,
-                    filters=None,
-                    snippets=True):
+                    filters=None, snippets=True, query_terms=None):
         assert isinstance(fields, list)
         if filters is None: filters = []
         if book: filters.append(self.index.Q(is_book=True))
         assert isinstance(fields, list)
         if filters is None: filters = []
         if book: filters.append(self.index.Q(is_book=True))
@@ -816,82 +763,10 @@ class Search(SolrIndex):
         query = self.index.query(query)
         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
         res = query.execute()
         query = self.index.query(query)
         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
         res = query.execute()
-        return [SearchResult(found, how_found='search_some') for found in res]
-
-    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
-    #     """
-    #     Search for perfect book matches. Just see if the query matches with some author or title,
-    #     taking hints into account.
-    #     """
-    #     fields_to_search = ['authors', 'title']
-    #     only_in = None
-    #     if hint:
-    #         if not hint.should_search_for_book():
-    #             return []
-    #         fields_to_search = hint.just_search_in(fields_to_search)
-    #         only_in = hint.book_filter()
-
-    #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
-
-    #     books = []
-    #     for q in qrys:
-    #         top = self.searcher.search(q,
-    #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-    #             max_results)
-    #         for found in top.scoreDocs:
-    #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
-    #     return books
-
-    # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
-    #     fields_to_search = ['tags', 'authors', 'title']
-
-    #     only_in = None
-    #     if hint:
-    #         if not hint.should_search_for_book():
-    #             return []
-    #         fields_to_search = hint.just_search_in(fields_to_search)
-    #         only_in = hint.book_filter()
-
-    #     tokens = self.get_tokens(searched, field='SIMPLE')
-
-    #     q = BooleanQuery()
-
-    #     for fld in fields_to_search:
-    #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
-    #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
-    #     books = []
-    #     top = self.searcher.search(q,
-    #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-    #         max_results)
-    #     for found in top.scoreDocs:
-    #         books.append(SearchResult(self, found, how_found="search_book"))
-
-    #     return books
-
-    # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
-    #     """
-    #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
-    #     some part/fragment of the book.
-    #     """
-    #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
+        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 
 
-    #     flt = None
-    #     if hint:
-    #         flt = hint.part_filter()
 
 
-    #     books = []
-    #     for q in qrys:
-    #         top = self.searcher.search(q,
-    #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
-    #                                                        flt]),
-    #                                    max_results)
-    #         for found in top.scoreDocs:
-    #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
-
-    #     return books
-
-    def search_everywhere(self, searched):
+    def search_everywhere(self, searched, query_terms=None):
         """
         Tries to use search terms to match different fields of book (or its parts).
         E.g. one word can be an author survey, another be a part of the title, and the rest
         """
         Tries to use search terms to match different fields of book (or its parts).
         E.g. one word can be an author survey, another be a part of the title, and the rest
@@ -899,7 +774,6 @@ class Search(SolrIndex):
         """
         books = []
         # content only query : themes x content
         """
         books = []
         # content only query : themes x content
-
         q = self.make_term_query(searched, 'text')
         q_themes = self.make_term_query(searched, 'themes_pl')
 
         q = self.make_term_query(searched, 'text')
         q_themes = self.make_term_query(searched, 'themes_pl')
 
@@ -907,7 +781,7 @@ class Search(SolrIndex):
         res = query.execute()
 
         for found in res:
         res = query.execute()
 
         for found in res:
-            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent'))
+            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
 
         # query themes/content x author/title/tags
         in_content = self.index.Q()
 
         # query themes/content x author/title/tags
         in_content = self.index.Q()
@@ -921,8 +795,9 @@ class Search(SolrIndex):
 
         q = in_content & in_meta
         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
 
         q = in_content & in_meta
         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
+
         for found in res:
         for found in res:
-            books.append(SearchResult(found, how_found='search_everywhere'))
+            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
 
         return books
 
 
         return books
 
@@ -946,7 +821,6 @@ class Search(SolrIndex):
                     continue
                 text = snippets.get((int(position),
                                      int(length)))
                     continue
                 text = snippets.get((int(position),
                                      int(length)))
-                print "== %s -- %s ==" % (query, text)
                 snip = self.index.highlight(text=text, field=field, q=query)
                 snips[idx] = snip
                 if snip:
                 snip = self.index.highlight(text=text, field=field, q=query)
                 snips[idx] = snip
                 if snip:
@@ -954,7 +828,7 @@ class Search(SolrIndex):
                 idx += 1
 
         except IOError, e:
                 idx += 1
 
         except IOError, e:
-            log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
+            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
             return []
         finally:
             snippets.close()
             return []
         finally:
             snippets.close()
@@ -963,6 +837,7 @@ class Search(SolrIndex):
         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 
         searchresult.snippets = snips
         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 
         searchresult.snippets = snips
+
         return snips
 
     def hint_tags(self, query, pdcounter=True, prefix=True):
         return snips
 
     def hint_tags(self, query, pdcounter=True, prefix=True):
@@ -991,6 +866,8 @@ class Search(SolrIndex):
         res = self.apply_filters(query, filters).execute()
 
         tags = []
         res = self.apply_filters(query, filters).execute()
 
         tags = []
+        pd_tags = []
+
         for doc in res:
             is_pdcounter = doc.get('is_pdcounter', False)
             category = doc.get('tag_category')
         for doc in res:
             is_pdcounter = doc.get('is_pdcounter', False)
             category = doc.get('tag_category')
@@ -1002,17 +879,19 @@ class Search(SolrIndex):
                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                         tag.category = 'pd_book'  # make it look more lik a tag.
                     else:
                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                         tag.category = 'pd_book'  # make it look more lik a tag.
                     else:
-                        print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
+                        print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)).encode('utf-8')
+                    pd_tags.append(tag)
                 else:
                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                 else:
                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
-                    # don't add the pdcounter tag if same tag already exists
-
-                tags.append(tag)
+                    tags.append(tag)
 
             except catalogue.models.Tag.DoesNotExist: pass
             except PDCounterAuthor.DoesNotExist: pass
             except PDCounterBook.DoesNotExist: pass
 
 
             except catalogue.models.Tag.DoesNotExist: pass
             except PDCounterAuthor.DoesNotExist: pass
             except PDCounterBook.DoesNotExist: pass
 
+        tags_slugs = set(map(lambda t: t.slug, tags))
+        tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
+
         log.debug('search_tags: %s' % tags)
 
         return tags
         log.debug('search_tags: %s' % tags)
 
         return tags
@@ -1038,40 +917,18 @@ class Search(SolrIndex):
         Searches for Book objects using query
         """
         bks = []
         Searches for Book objects using query
         """
         bks = []
+        bks_found = set()
+        query = query.query(is_book=True)
         res = self.apply_filters(query, filters).field_limit(['book_id'])
         for r in res:
             try:
         res = self.apply_filters(query, filters).field_limit(['book_id'])
         for r in res:
             try:
-                bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
+                bid = r['book_id']
+                if not bid in bks_found:
+                    bks.append(catalogue.models.Book.objects.get(id=bid))
+                    bks_found.add(bid)
             except catalogue.models.Book.DoesNotExist: pass
         return bks
  
             except catalogue.models.Book.DoesNotExist: pass
         return bks
  
-    # def make_prefix_phrase(self, toks, field):
-    #     q = MultiPhraseQuery()
-    #     for i in range(len(toks)):
-    #         t = Term(field, toks[i])
-    #         if i == len(toks) - 1:
-    #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
-    #             if pterms:
-    #                 q.add(pterms)
-    #             else:
-    #                 q.add(t)
-    #         else:
-    #             q.add(t)
-    #     return q
-
-    # @staticmethod
-    # def term_filter(term, inverse=False):
-    #     only_term = TermsFilter()
-    #     only_term.addTerm(term)
-
-    #     if inverse:
-    #         neg = BooleanFilter()
-    #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
-    #         only_term = neg
-
-    #     return only_term
-
-
 
     @staticmethod
     def apply_filters(query, filters):
 
     @staticmethod
     def apply_filters(query, filters):
@@ -1083,15 +940,3 @@ class Search(SolrIndex):
         for f in filters:
             query = query.query(f)
         return query
         for f in filters:
             query = query.query(f)
         return query
-
-    # def filtered_categories(self, tags):
-    #     """
-    #     Return a list of tag categories, present in tags list.
-    #     """
-    #     cats = {}
-    #     for t in tags:
-    #         cats[t.category] = True
-    #     return cats.keys()
-
-    # def hint(self):
-    #     return Hint(self)