Cleaning: timezone issues, deprecated urls.py imports, missing notes.

[wolnelektury.git] / apps / search / index.py
diff --git a/apps/search/index.py b/apps/search/index.py

index e7f28c9..7fb60b5 100644 (file)
--- a/apps/search/index.py
+++ b/apps/search/index.py
@@ -1,5 +1,7 @@
  # -*- coding: utf-8 -*-
  # -*- coding: utf-8 -*-
-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
  from django.conf import settings
  
  import os
  from django.conf import settings
  
  import os
@@ -15,12 +17,14 @@ import traceback
  import logging
  log = logging.getLogger('search')
  import sunburnt
  import logging
  log = logging.getLogger('search')
  import sunburnt
-import highlight
+import custom
+import operator
  
  
+log = logging.getLogger('search')
  
  class SolrIndex(object):
      def __init__(self, mode=None):
  
  class SolrIndex(object):
      def __init__(self, mode=None):
-        self.index = highlight.HLSolrInterface(settings.SOLR, mode=mode)
+        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  
  
  class Snippets(object):
  
  
  class Snippets(object):
@@ -91,7 +95,8 @@ class Snippets(object):
  
      def close(self):
          """Close snippet file"""
  
      def close(self):
          """Close snippet file"""
-        self.file.close()
+        if self.file:
+            self.file.close()
  
      def remove(self):
          self.revision = None
  
      def remove(self):
          self.revision = None
@@ -110,7 +115,7 @@ class Index(SolrIndex):
      Class indexing books.
      """
      def __init__(self):
      Class indexing books.
      """
      def __init__(self):
-        super(Index, self).__init__()
+        super(Index, self).__init__(mode='rw')
  
      def delete_query(self, *queries):
          """
  
      def delete_query(self, *queries):
          """
@@ -130,8 +135,7 @@ class Index(SolrIndex):
                      break
                  for res in ids:
                      uids.add(res['uid'])
                      break
                  for res in ids:
                      uids.add(res['uid'])
-                st+=rows
-                #        print "Will delete %s" % ','.join([x for x in uids])
+                st += rows
          if uids:
              self.index.delete(uids)
              return True
          if uids:
              self.index.delete(uids)
              return True
@@ -144,6 +148,7 @@ class Index(SolrIndex):
          Removes all tags from index, then index them again.
          Indexed fields include: id, name (with and without polish stems), category
          """
          Removes all tags from index, then index them again.
          Indexed fields include: id, name (with and without polish stems), category
          """
+        log.debug("Indexing tags")
          remove_only = kw.get('remove_only', False)
          # first, remove tags from index.
          if tags:
          remove_only = kw.get('remove_only', False)
          # first, remove tags from index.
          if tags:
@@ -160,7 +165,7 @@ class Index(SolrIndex):
  
                  q_id_cat = self.index.Q(q_id & q_cat)
                  tag_qs.append(q_id_cat)
  
                  q_id_cat = self.index.Q(q_id & q_cat)
                  tag_qs.append(q_id_cat)
-            self.delete_query(tag_qs)
+            self.delete_query(*tag_qs)
          else:  # all
              q = self.index.Q(tag_id__any=True)
              self.delete_query(q)
          else:  # all
              q = self.index.Q(tag_id__any=True)
              self.delete_query(q)
@@ -179,7 +184,8 @@ class Index(SolrIndex):
                          "tag_name": tag.name,
                          "tag_name_pl": tag.name,
                          "tag_category": 'pd_author',
                          "tag_name": tag.name,
                          "tag_name_pl": tag.name,
                          "tag_category": 'pd_author',
-                        "is_pdcounter": True
+                        "is_pdcounter": True,
+                        "uid": "tag%d_pd_a" % tag.id
                          }
                  elif isinstance(tag, PDCounterBook):
                      doc = {
                          }
                  elif isinstance(tag, PDCounterBook):
                      doc = {
@@ -187,7 +193,8 @@ class Index(SolrIndex):
                          "tag_name": tag.title,
                          "tag_name_pl": tag.title,
                          "tag_category": 'pd_book',
                          "tag_name": tag.title,
                          "tag_name_pl": tag.title,
                          "tag_category": 'pd_book',
-                        "is_pdcounter": True
+                        "is_pdcounter": True,
+                        "uid": "tag%d_pd_b" % tag.id
                          }
                  else:
                      doc = {
                          }
                  else:
                      doc = {
@@ -195,9 +202,9 @@ class Index(SolrIndex):
                          "tag_name": tag.name,
                          "tag_name_pl": tag.name,
                          "tag_category": tag.category,
                          "tag_name": tag.name,
                          "tag_name_pl": tag.name,
                          "tag_category": tag.category,
-                        "is_pdcounter": False
+                        "is_pdcounter": False,
+                        "uid": "tag%d" % tag.id
                          }
                          }
-                doc['uid'] = "tag%d" % tag.id
                  self.index.add(doc)
  
      def create_book_doc(self, book):
                  self.index.add(doc)
  
      def create_book_doc(self, book):
@@ -237,7 +244,7 @@ class Index(SolrIndex):
              self.remove_book(book, remove_snippets=False)
  
          book_doc = self.create_book_doc(book)
              self.remove_book(book, remove_snippets=False)
  
          book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
          # let's not index it - it's only used for extracting publish date
          if 'source_name' in meta_fields:
              del meta_fields['source_name']
          # let's not index it - it's only used for extracting publish date
          if 'source_name' in meta_fields:
              del meta_fields['source_name']
@@ -248,11 +255,16 @@ class Index(SolrIndex):
          book_doc['uid'] = "book%s" % book_doc['book_id']
          self.index.add(book_doc)
          del book_doc
          book_doc['uid'] = "book%s" % book_doc['book_id']
          self.index.add(book_doc)
          del book_doc
-
-        self.index_content(book, book_fields={
+        book_fields = {
              'title': meta_fields['title'],
              'authors': meta_fields['authors'],
              'title': meta_fields['title'],
              'authors': meta_fields['authors'],
-            'published_date': meta_fields['published_date']})
+            'published_date': meta_fields['published_date']
+            }
+
+        if 'translators' in meta_fields:
+            book_fields['translators'] = meta_fields['translators']
+
+        self.index_content(book, book_fields=book_fields)
  
      master_tags = [
          'opowiadanie',
  
      master_tags = [
          'opowiadanie',
@@ -410,7 +422,7 @@ class Index(SolrIndex):
                  doc['themes'] = fields['themes']
              doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                           doc['header_span'],
                  doc['themes'] = fields['themes']
              doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                           doc['header_span'],
-                                         doc.get('fragment_anchor',''))
+                                         doc.get('fragment_anchor', ''))
              return doc
  
          def give_me_utf8(s):
              return doc
  
          def give_me_utf8(s):
@@ -453,9 +465,7 @@ class Index(SolrIndex):
                          doc = add_part(snippets, header_index=position, header_type=header.tag,
                                         text=u''.join(footnote),
                                         is_footnote=True)
                          doc = add_part(snippets, header_index=position, header_type=header.tag,
                                         text=u''.join(footnote),
                                         is_footnote=True)
-
                          self.index.add(doc)
                          self.index.add(doc)
-                        #print "@ footnote text: %s" % footnote
                          footnote = []
  
                      # handle fragments and themes.
                          footnote = []
  
                      # handle fragments and themes.
@@ -488,7 +498,6 @@ class Index(SolrIndex):
                                         fragment_anchor=fid,
                                         text=fix_format(frag['text']),
                                         themes=frag['themes'])
                                         fragment_anchor=fid,
                                         text=fix_format(frag['text']),
                                         themes=frag['themes'])
-                        #print '@ FRAG %s' % frag['content']
                          self.index.add(doc)
  
                          # Collect content.
                          self.index.add(doc)
  
                          # Collect content.
@@ -501,7 +510,6 @@ class Index(SolrIndex):
                          # in the end, add a section text.
                  doc = add_part(snippets, header_index=position,
                                 header_type=header.tag, text=fix_format(content))
                          # in the end, add a section text.
                  doc = add_part(snippets, header_index=position,
                                 header_type=header.tag, text=fix_format(content))
-                #print '@ CONTENT: %s' % fix_format(content)
  
                  self.index.add(doc)
  
  
                  self.index.add(doc)
  
@@ -509,47 +517,54 @@ class Index(SolrIndex):
              snippets.close()
  
  
              snippets.close()
  
  
-
  class SearchResult(object):
  class SearchResult(object):
-    def __init__(self, search, doc, how_found=None, snippets=None, searched=None, tokens_cache=None):
-        if tokens_cache is None: tokens_cache = {}
+    def __init__(self, doc, how_found=None, query=None, query_terms=None):
+        #        self.search = search
+        self.boost = 1.0
+        self._hits = []
+        self._processed_hits = None  # processed hits
+        self.snippets = []
+        self.query_terms = query_terms
  
          if 'score' in doc:
              self._score = doc['score']
          else:
              self._score = 0
  
  
          if 'score' in doc:
              self._score = doc['score']
          else:
              self._score = 0
  
-        self.boost = 1.0
-
-        self._hits = []
-        self._processed_hits = None  # processed hits
-
          self.book_id = int(doc["book_id"])
  
          self.book_id = int(doc["book_id"])
  
-        pd = doc["published_date"]
          try:
          try:
-            self.published_date = int(pd)
+            self.published_date = int(doc.get("published_date"))
          except ValueError:
              self.published_date = 0
  
          except ValueError:
              self.published_date = 0
  
+        # content hits
          header_type = doc.get("header_type", None)
          # we have a content hit in some header of fragment
          if header_type is not None:
              sec = (header_type, int(doc["header_index"]))
              header_span = doc['header_span']
              header_span = header_span is not None and int(header_span) or 1
          header_type = doc.get("header_type", None)
          # we have a content hit in some header of fragment
          if header_type is not None:
              sec = (header_type, int(doc["header_index"]))
              header_span = doc['header_span']
              header_span = header_span is not None and int(header_span) or 1
-
              fragment = doc.get("fragment_anchor", None)
              fragment = doc.get("fragment_anchor", None)
+            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
+            snippets_rev = doc.get('snippets_revision', None)
  
  
-            if snippets:
-                snippets = snippets.replace("/\n", "\n")
-            hit = (sec + (header_span,), fragment, self._score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
+            hit = (sec + (header_span,), fragment, self._score, {
+                'how_found': how_found,
+                'snippets_pos': snippets_pos,
+                'snippets_revision': snippets_rev,
+                'themes': doc.get('themes', []),
+                'themes_pl': doc.get('themes_pl', [])
+                })
  
              self._hits.append(hit)
  
  
              self._hits.append(hit)
  
-        self.search = search
-        self.searched = searched
-        self.tokens_cache = tokens_cache
+    def __unicode__(self):
+        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
+            (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
+
+    def __str__(self):
+        return unicode(self).encode('utf-8')
  
      @property
      def score(self):
  
      @property
      def score(self):
@@ -566,31 +581,32 @@ class SearchResult(object):
      def get_book(self):
          if hasattr(self, '_book'):
              return self._book
      def get_book(self):
          if hasattr(self, '_book'):
              return self._book
-        return catalogue.models.Book.objects.get(id=self.book_id)
+        self._book = catalogue.models.Book.objects.get(id=self.book_id)
+        return self._book
  
      book = property(get_book)
  
  
      book = property(get_book)
  
+    POSITION = 0
+    FRAGMENT = 1
+    POSITION_INDEX = 1
+    POSITION_SPAN = 2
+    SCORE = 2
+    OTHER = 3
+
      @property
      def hits(self):
          if self._processed_hits is not None:
              return self._processed_hits
  
      @property
      def hits(self):
          if self._processed_hits is not None:
              return self._processed_hits
  
-        POSITION = 0
-        FRAGMENT = 1
-        POSITION_INDEX = 1
-        POSITION_SPAN = 2
-        SCORE = 2
-        OTHER = 3
-
          # to sections and fragments
          # to sections and fragments
-        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
+        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
  
  
-        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
+        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
  
          # sections not covered by fragments
          sect = filter(lambda s: 0 == len(filter(
  
          # sections not covered by fragments
          sect = filter(lambda s: 0 == len(filter(
-            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
-            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
+            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
+            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
              frags)), sect)
  
          hits = []
              frags)), sect)
  
          hits = []
@@ -606,55 +622,61 @@ class SearchResult(object):
              return els.values()
  
          # remove fragments with duplicated fid's and duplicated snippets
              return els.values()
  
          # remove fragments with duplicated fid's and duplicated snippets
-        frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
-        frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
-                                  lambda a, b: cmp(a[SCORE], b[SCORE]))
+        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
+        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
+        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
  
          # remove duplicate sections
          sections = {}
  
          for s in sect:
  
          # remove duplicate sections
          sections = {}
  
          for s in sect:
-            si = s[POSITION][POSITION_INDEX]
+            si = s[self.POSITION][self.POSITION_INDEX]
              # skip existing
              if si in sections:
              # skip existing
              if si in sections:
-                if sections[si]['score'] >= s[SCORE]:
+                if sections[si]['score'] >= s[self.SCORE]:
                      continue
  
                      continue
  
-            m = {'score': s[SCORE],
-                 'section_number': s[POSITION][POSITION_INDEX] + 1,
+            m = {'score': s[self.SCORE],
+                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                   }
                   }
-            m.update(s[OTHER])
+            m.update(s[self.OTHER])
              sections[si] = m
  
          hits = sections.values()
  
          for f in frags:
              try:
              sections[si] = m
  
          hits = sections.values()
  
          for f in frags:
              try:
-                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
+                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
              except catalogue.models.Fragment.DoesNotExist:
                  # stale index
                  continue
              except catalogue.models.Fragment.DoesNotExist:
                  # stale index
                  continue
-
              # Figure out if we were searching for a token matching some word in theme name.
              themes = frag.tags.filter(category='theme')
              # Figure out if we were searching for a token matching some word in theme name.
              themes = frag.tags.filter(category='theme')
-            themes_hit = []
-            if self.searched is not None:
-                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
-                for theme in themes:
-                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
-                    for t in tokens:
-                        if t in name_tokens:
-                            if not theme in themes_hit:
-                                themes_hit.append(theme)
+            themes_hit = set()
+            if self.query_terms is not None:
+                for i in range(0, len(f[self.OTHER]['themes'])):
+                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
+                    tms = map(unicode.lower, tms)
+                    for qt in self.query_terms:
+                        if qt in tms:
+                            themes_hit.add(f[self.OTHER]['themes'][i])
                              break
  
                              break
  
-            m = {'score': f[SCORE],
+            def theme_by_name(n):
+                th = filter(lambda t: t.name == n, themes)
+                if th:
+                    return th[0]
+                else:
+                    return None
+            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
+
+            m = {'score': f[self.SCORE],
                   'fragment': frag,
                   'fragment': frag,
-                 'section_number': f[POSITION][POSITION_INDEX] + 1,
+                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                   'themes': themes,
                   'themes_hit': themes_hit
                   }
                   'themes': themes,
                   'themes_hit': themes_hit
                   }
-            m.update(f[OTHER])
+            m.update(f[self.OTHER])
              hits.append(m)
  
          hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
              hits.append(m)
  
          hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
@@ -663,9 +685,6 @@ class SearchResult(object):
  
          return hits
  
  
          return hits
  
-    def __unicode__(self):
-        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
-
      @staticmethod
      def aggregate(*result_lists):
          books = {}
      @staticmethod
      def aggregate(*result_lists):
          books = {}
@@ -685,624 +704,239 @@ class SearchResult(object):
          else:
              return c
  
          else:
              return c
  
+    def __len__(self):
+        return len(self.hits)
  
  
-class Hint(object):
-    """
-    Given some hint information (information we already know about)
-    our search target - like author, title (specific book), epoch, genre, kind
-    we can narrow down search using filters.
-    """
-    def __init__(self, search):
-        """
-        Accepts a Searcher instance.
-        """
-        self.search = search
-        self.book_tags = {}
-        self.part_tags = []
-        self._books = []
-
-    def books(self, *books):
-        """
-        Give a hint that we search these books.
-        """
-        self._books = books
+    def snippet_pos(self, idx=0):
+        return self.hits[idx]['snippets_pos']
  
  
-    def tags(self, tags):
-        """
-        Give a hint that these Tag objects (a list of)
-        is necessary.
-        """
-        for t in tags:
-            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
-                lst = self.book_tags.get(t.category, [])
-                lst.append(t)
-                self.book_tags[t.category] = lst
-            if t.category in ['theme', 'theme_pl']:
-                self.part_tags.append(t)
-
-    def tag_filter(self, tags, field='tags'):
-        """
-        Given a lsit of tags and an optional field (but they are normally in tags field)
-        returns a filter accepting only books with specific tags.
-        """
-        q = BooleanQuery()
-
-        for tag in tags:
-            toks = self.search.get_tokens(tag.name, field=field)
-            tag_phrase = PhraseQuery()
-            for tok in toks:
-                tag_phrase.add(Term(field, tok))
-            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
-
-        return QueryWrapperFilter(q)
-
-    def book_filter(self):
-        """
-        Filters using book tags (all tag kinds except a theme)
-        """
-        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
-        if tags:
-            return self.tag_filter(tags)
-        else:
+    def snippet_revision(self, idx=0):
+        try:
+            return self.hits[idx]['snippets_revision']
+        except:
              return None
  
              return None
  
-    def part_filter(self):
-        """
-        This filter can be used to look for book parts.
-        It filters on book id and/or themes.
-        """
-        fs = []
-        if self.part_tags:
-            fs.append(self.tag_filter(self.part_tags, field='themes'))
-
-        if self._books != []:
-            bf = BooleanFilter()
-            for b in self._books:
-                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
-                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
-            fs.append(bf)
-
-        return Search.chain_filters(fs)
-
-    def should_search_for_book(self):
-        return self._books == []
-
-    def just_search_in(self, all):
-        """Holds logic to figure out which indexes should be search, when we have some hinst already"""
-        some = []
-        for field in all:
-            if field == 'authors' and 'author' in self.book_tags:
-                continue
-            if field == 'title' and self._books != []:
-                continue
-            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
-                continue
-            some.append(field)
-        return some
-
  
  class Search(SolrIndex):
      """
      Search facilities.
      """
      def __init__(self, default_field="text"):
  
  class Search(SolrIndex):
      """
      Search facilities.
      """
      def __init__(self, default_field="text"):
-        IndexStore.__init__(self)
-        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
-        # self.analyzer = WLAnalyzer()
-        reader = IndexReader.open(self.store, True)
-        self.searcher = IndexSearcher(reader)
-        self.parser = QueryParser(Version.LUCENE_34, default_field,
-                                  self.analyzer)
-
-        self.parent_filter = TermsFilter()
-        self.parent_filter.addTerm(Term("is_book", "true"))
-        index_changed.connect(self.reopen)
-
-    def close(self):
-        reader = self.searcher.getIndexReader()
-        self.searcher.close()
-        reader.close()
-        super(Search, self).close()
-        index_changed.disconnect(self.reopen)
-
-    def reopen(self, **unused):
-        reader = self.searcher.getIndexReader()
-        rdr = reader.reopen()
-        if not rdr.equals(reader):
-            log.debug('Reopening index')
-            oldsearch = self.searcher
-            self.searcher = IndexSearcher(rdr)
-            oldsearch.close()
-            reader.close()
-
-    def query(self, query):
-        """Parse query in default Lucene Syntax. (for humans)
-        """
-        return self.parser.parse(query)
-
-    def simple_search(self, query, max_results=50):
-        """Runs a query for books using lucene syntax. (for humans)
-        Returns (books, total_hits)
-        """
-
-        tops = self.searcher.search(self.query(query), max_results)
-        bks = []
-        for found in tops.scoreDocs:
-            doc = self.searcher.doc(found.doc)
-            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
-        return (bks, tops.totalHits)
-
-    def get_tokens(self, searched, field='text', cached=None):
-        """returns tokens analyzed by a proper (for a field) analyzer
-        argument can be: StringReader, string/unicode, or tokens. In the last case
-        they will just be returned (so we can reuse tokens, if we don't change the analyzer)
-        """
-        if cached is not None and field in cached:
-            return cached[field]
-
-        if isinstance(searched, str) or isinstance(searched, unicode):
-            searched = StringReader(searched)
-        elif isinstance(searched, list):
-            return searched
-
-        searched.reset()
-        tokens = self.analyzer.reusableTokenStream(field, searched)
-        toks = []
-        while tokens.incrementToken():
-            cta = tokens.getAttribute(CharTermAttribute.class_)
-            toks.append(cta.toString())
-
-        if cached is not None:
-            cached[field] = toks
-
-        return toks
-
-    @staticmethod
-    def fuzziness(fuzzy):
-        """Helper method to sanitize fuzziness"""
-        if not fuzzy:
-            return None
-        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
-            return fuzzy
-        else:
-            return 0.5
-
-    def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
-        """
-        Return a PhraseQuery with a series of tokens.
-        """
-        if fuzzy:
-            phrase = MultiPhraseQuery()
-            for t in tokens:
-                term = Term(field, t)
-                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
-                fuzzterms = []
+        super(Search, self).__init__(mode='r')
  
  
-                while True:
-                    ft = fuzzterm.term()
-                    if ft:
-                        fuzzterms.append(ft)
-                    if not fuzzterm.next(): break
-                if fuzzterms:
-                    phrase.add(JArray('object')(fuzzterms, Term))
-                else:
-                    phrase.add(term)
-        else:
-            phrase = PhraseQuery()
-            phrase.setSlop(slop)
-            for t in tokens:
-                term = Term(field, t)
-                phrase.add(term)
-        return phrase
  
  
-    @staticmethod
-    def make_term_query(tokens, field='text', modal='BooleanClause.Occur.SHOULD XXX', fuzzy=False):
+    def make_term_query(self, query, field='text', modal=operator.or_):
          """
          Returns term queries joined by boolean query.
          modal - applies to boolean query
          fuzzy - should the query by fuzzy.
          """
          """
          Returns term queries joined by boolean query.
          modal - applies to boolean query
          fuzzy - should the query by fuzzy.
          """
-        q = BooleanQuery()
-        for t in tokens:
-            term = Term(field, t)
-            if fuzzy:
-                term = FuzzyQuery(term, self.fuzziness(fuzzy))
-            else:
-                term = TermQuery(term)
-            q.add(BooleanClause(term, modal))
+        if query is None: query = ''
+        q = self.index.Q()
+        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
+                        query.split(r" ")), q)
+
          return q
  
          return q
  
-    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
-                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
+    def search_phrase(self, searched, field='text', book=False,
+                      filters=None,
+                      snippets=False):
          if filters is None: filters = []
          if filters is None: filters = []
-        if tokens_cache is None: tokens_cache = {}
+        if book: filters.append(self.index.Q(is_book=True))
  
  
-        tokens = self.get_tokens(searched, field, cached=tokens_cache)
+        q = self.index.query(**{field: searched})
+        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
+        res = q.execute()
+        return [SearchResult(found, how_found=u'search_phrase') for found in res]
  
  
-        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
-        if book:
-            filters.append(self.term_filter(Term('is_book', 'true')))
-        top = self.searcher.search(query, self.chain_filters(filters), max_results)
-
-        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
-
-    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
-                    filters=None, tokens_cache=None, boost=None, snippets=True):
+    def search_some(self, searched, fields, book=True,
+                    filters=None, snippets=True, query_terms=None):
+        assert isinstance(fields, list)
          if filters is None: filters = []
          if filters is None: filters = []
-        if tokens_cache is None: tokens_cache = {}
-
-        if book:
-            filters.append(self.term_filter(Term('is_book', 'true')))
+        if book: filters.append(self.index.Q(is_book=True))
  
  
-        query = BooleanQuery()
+        query = self.index.Q()
  
          for fld in fields:
  
          for fld in fields:
-            tokens = self.get_tokens(searched, fld, cached=tokens_cache)
+            query = self.index.Q(query | self.make_term_query(searched, fld))
  
  
-            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
-                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
+        query = self.index.query(query)
+        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
+        res = query.execute()
+        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
  
  
-        top = self.searcher.search(query, self.chain_filters(filters), max_results)
  
  
-        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
-                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
-
-    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
-        """
-        Search for perfect book matches. Just see if the query matches with some author or title,
-        taking hints into account.
-        """
-        fields_to_search = ['authors', 'title']
-        only_in = None
-        if hint:
-            if not hint.should_search_for_book():
-                return []
-            fields_to_search = hint.just_search_in(fields_to_search)
-            only_in = hint.book_filter()
-
-        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
-
-        books = []
-        for q in qrys:
-            top = self.searcher.search(q,
-                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-                max_results)
-            for found in top.scoreDocs:
-                books.append(SearchResult(self, found, how_found="search_perfect_book"))
-        return books
-
-    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
-        fields_to_search = ['tags', 'authors', 'title']
-
-        only_in = None
-        if hint:
-            if not hint.should_search_for_book():
-                return []
-            fields_to_search = hint.just_search_in(fields_to_search)
-            only_in = hint.book_filter()
-
-        tokens = self.get_tokens(searched, field='SIMPLE')
-
-        q = BooleanQuery()
-
-        for fld in fields_to_search:
-            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
-                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
-        books = []
-        top = self.searcher.search(q,
-                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-            max_results)
-        for found in top.scoreDocs:
-            books.append(SearchResult(self, found, how_found="search_book"))
-
-        return books
-
-    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
-        """
-        Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
-        some part/fragment of the book.
-        """
-        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
-
-        flt = None
-        if hint:
-            flt = hint.part_filter()
-
-        books = []
-        for q in qrys:
-            top = self.searcher.search(q,
-                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
-                                                           flt]),
-                                       max_results)
-            for found in top.scoreDocs:
-                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
-
-        return books
-
-    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
+    def search_everywhere(self, searched, query_terms=None):
          """
          Tries to use search terms to match different fields of book (or its parts).
          E.g. one word can be an author's surname, another can be part of the title, and the rest
          are some words from the third chapter.
          """
          """
          Tries to use search terms to match different fields of book (or its parts).
          E.g. one word can be an author's surname, another can be part of the title, and the rest
          are some words from the third chapter.
          """
-        if tokens_cache is None: tokens_cache = {}
          books = []
          books = []
-        only_in = None
-
-        if hint:
-            only_in = hint.part_filter()
-
          # content only query : themes x content
          # content only query : themes x content
-        q = BooleanQuery()
-
-        tokens_pl = self.get_tokens(searched, field='text', cached=tokens_cache)
-        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
-
-        # only search in themes when we do not already filter by themes
-        if hint is None or hint.just_search_in(['themes']) != []:
-            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
-                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))
+        q = self.make_term_query(searched, 'text')
+        q_themes = self.make_term_query(searched, 'themes_pl')
  
  
-        q.add(BooleanClause(self.make_term_query(tokens_pl, field='text',
-                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
+        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
+        res = query.execute()
  
  
-        topDocs = self.searcher.search(q, only_in, max_results)
-        for found in topDocs.scoreDocs:
-            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
+        for found in res:
+            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
  
          # query themes/content x author/title/tags
  
          # query themes/content x author/title/tags
-        q = BooleanQuery()
-        in_content = BooleanQuery()
-        in_meta = BooleanQuery()
+        in_content = self.index.Q()
+        in_meta = self.index.Q()
  
          for fld in ['themes_pl', 'text']:
  
          for fld in ['themes_pl', 'text']:
-            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
+            in_content |= self.make_term_query(searched, field=fld)
  
          for fld in ['tags', 'authors', 'title']:
  
          for fld in ['tags', 'authors', 'title']:
-            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
+            in_meta |= self.make_term_query(searched, field=fld)
  
  
-        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
-        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
+        q = in_content & in_meta
+        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
  
  
-        topDocs = self.searcher.search(q, only_in, max_results)
-        for found in topDocs.scoreDocs:
-            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
+        for found in res:
+            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
  
          return books
  
  
          return books
  
-    # def multisearch(self, query, max_results=50):
-    #     """
-    #     Search strategy:
-    #     - (phrase) OR -> content
-    #                   -> title
-    #                   -> authors
-    #     - (keywords)  -> authors
-    #                   -> motyw
-    #                   -> tags
-    #                   -> content
-    #     """
-        # queryreader = StringReader(query)
-        # tokens = self.get_tokens(queryreader)
-
-        # top_level = BooleanQuery()
-        # Should = BooleanClause.Occur.SHOULD
-
-        # phrase_level = BooleanQuery()
-        # phrase_level.setBoost(1.3)
-
-        # p_content = self.make_phrase(tokens, joined=True)
-        # p_title = self.make_phrase(tokens, 'title')
-        # p_author = self.make_phrase(tokens, 'author')
-
-        # phrase_level.add(BooleanClause(p_content, Should))
-        # phrase_level.add(BooleanClause(p_title, Should))
-        # phrase_level.add(BooleanClause(p_author, Should))
-
-        # kw_level = BooleanQuery()
-
-        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
-        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
-        # kw_level.add(j_themes, Should)
-        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
-        # j_con = self.make_term_query(tokens, joined=True)
-        # kw_level.add(j_con, Should)
-
-        # top_level.add(BooleanClause(phrase_level, Should))
-        # top_level.add(BooleanClause(kw_level, Should))
-
-        # return None
-
-    def get_snippets(self, scoreDoc, query, field='text'):
+    def get_snippets(self, searchresult, query, field='text', num=1):
          """
          Returns a list of highlighted snippets for the given search result.
          """
          """
          Returns a list of highlighted snippets for the given search result.
          """
-        htmlFormatter = SimpleHTMLFormatter()
-        highlighter = Highlighter(htmlFormatter, QueryScorer(query))
-
-        stored = self.searcher.doc(scoreDoc.doc)
-
-        position = stored.get('snippets_position')
-        length = stored.get('snippets_length')
-        if position is None or length is None:
-            return None
-        revision = stored.get('snippets_revision')
-        if revision: revision = int(revision)
-
-        # locate content.
-        book_id = int(stored.get('book_id'))
+        maxnum = len(searchresult)
+        if num is None or num < 0 or num > maxnum:
+            num = maxnum
+        book_id = searchresult.book_id
+        revision = searchresult.snippet_revision()
          snippets = Snippets(book_id, revision=revision)
          snippets = Snippets(book_id, revision=revision)
-
+        snips = [None] * maxnum
          try:
              snippets.open()
          try:
              snippets.open()
+            idx = 0
+            while idx < maxnum and num > 0:
+                position, length = searchresult.snippet_pos(idx)
+                if position is None or length is None:
+                    continue
+                text = snippets.get((int(position),
+                                     int(length)))
+                snip = self.index.highlight(text=text, field=field, q=query)
+                snips[idx] = snip
+                if snip:
+                    num -= 1
+                idx += 1
+
          except IOError, e:
          except IOError, e:
-            log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
+            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
              return []
              return []
+        finally:
+            snippets.close()
  
  
-        try:
-            try:
-                text = snippets.get((int(position),
-                                     int(length)))
-            finally:
-                snippets.close()
+            # remove verse end markers..
+        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
  
  
-            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
-            #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
-            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
+        searchresult.snippets = snips
  
  
-        except Exception, e:
-            e2 = e
-            if hasattr(e, 'getJavaException'):
-                e2 = unicode(e.getJavaException())
-            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
-                e2)
-        return snip
+        return snips
  
  
-    @staticmethod
-    def enum_to_array(enum):
+    def hint_tags(self, query, pdcounter=True, prefix=True):
          """
          """
-        Converts a lucene TermEnum to array of Terms, suitable for
-        addition to queries
+        Return auto-complete hints for tags
+        using prefix search.
          """
          """
-        terms = []
-
-        while True:
-            t = enum.term()
-            if t:
-                terms.append(t)
-            if not enum.next(): break
+        q = self.index.Q()
+        query = query.strip()
+        for field in ['tag_name', 'tag_name_pl']:
+            if prefix:
+                q |= self.index.Q(**{field: query + "*"})
+            else:
+                q |= self.make_term_query(query, field=field)
+        qu = self.index.query(q).exclude(tag_category="book")
  
  
-        if terms:
-            return JArray('object')(terms, Term)
+        return self.search_tags(qu, pdcounter=pdcounter)
  
  
-    def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
+    def search_tags(self, query, filters=None, pdcounter=False):
          """
          Search for Tag objects using query.
          """
          """
          Search for Tag objects using query.
          """
+        if not filters: filters = []
          if not pdcounter:
          if not pdcounter:
-            filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
-        tops = self.searcher.search(query, filt, max_results)
+            filters.append(~self.index.Q(is_pdcounter=True))
+        res = self.apply_filters(query, filters).execute()
  
          tags = []
  
          tags = []
-        for found in tops.scoreDocs:
-            doc = self.searcher.doc(found.doc)
-            is_pdcounter = doc.get('is_pdcounter')
+        pd_tags = []
+
+        for doc in res:
+            is_pdcounter = doc.get('is_pdcounter', False)
              category = doc.get('tag_category')
              try:
              category = doc.get('tag_category')
              try:
-                if is_pdcounter == 'true':
+                if is_pdcounter == True:
                      if category == 'pd_author':
                          tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                      elif category == 'pd_book':
                          tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                          tag.category = 'pd_book'  # make it look more like a tag.
                      else:
                      if category == 'pd_author':
                          tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                      elif category == 'pd_book':
                          tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                          tag.category = 'pd_book'  # make it look more like a tag.
                      else:
-                        print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
+                        print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)).encode('utf-8')
+                    pd_tags.append(tag)
                  else:
                      tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                  else:
                      tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
-                    # don't add the pdcounter tag if same tag already exists
-
-                tags.append(tag)
+                    tags.append(tag)
  
              except catalogue.models.Tag.DoesNotExist: pass
              except PDCounterAuthor.DoesNotExist: pass
              except PDCounterBook.DoesNotExist: pass
  
  
              except catalogue.models.Tag.DoesNotExist: pass
              except PDCounterAuthor.DoesNotExist: pass
              except PDCounterBook.DoesNotExist: pass
  
+        tags_slugs = set(map(lambda t: t.slug, tags))
+        tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
+
          log.debug('search_tags: %s' % tags)
  
          return tags
  
          log.debug('search_tags: %s' % tags)
  
          return tags
  
-    def search_books(self, query, filt=None, max_results=10):
-        """
-        Searches for Book objects using query
-        """
-        bks = []
-        tops = self.searcher.search(query, filt, max_results)
-        for found in tops.scoreDocs:
-            doc = self.searcher.doc(found.doc)
-            try:
-                bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
-            except catalogue.models.Book.DoesNotExist: pass
-        return bks
-
-    def make_prefix_phrase(self, toks, field):
-        q = MultiPhraseQuery()
-        for i in range(len(toks)):
-            t = Term(field, toks[i])
-            if i == len(toks) - 1:
-                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
-                if pterms:
-                    q.add(pterms)
-                else:
-                    q.add(t)
-            else:
-                q.add(t)
-        return q
-
-    @staticmethod
-    def term_filter(term, inverse=False):
-        only_term = TermsFilter()
-        only_term.addTerm(term)
-
-        if inverse:
-            neg = BooleanFilter()
-            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
-            only_term = neg
-
-        return only_term
-
-    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
-        """
-        Return auto-complete hints for tags
-        using prefix search.
-        """
-        toks = self.get_tokens(string, field='SIMPLE')
-        top = BooleanQuery()
-
-        for field in ['tag_name', 'tag_name_pl']:
-            if prefix:
-                q = self.make_prefix_phrase(toks, field)
-            else:
-                q = self.make_term_query(toks, field, fuzzy=fuzzy)
-            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
-
-        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
-
-        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
-
-    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
+    def hint_books(self, query, prefix=True):
          """
          Returns auto-complete hints for book titles,
          because we do not index 'pseudo' title-tags.
          Uses prefix search by default.
          """
          """
          Returns auto-complete hints for book titles,
          because we do not index 'pseudo' title-tags.
          Uses prefix search by default.
          """
-        toks = self.get_tokens(string, field='SIMPLE')
-
+        q = self.index.Q()
+        query = query.strip()
          if prefix:
          if prefix:
-            q = self.make_prefix_phrase(toks, 'title')
+            q |= self.index.Q(title=query + "*")
          else:
          else:
-            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
-
-        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
+            q |= self.make_term_query(query, field='title')
+        qu = self.index.query(q)
+        only_books = self.index.Q(is_book=True)
+        return self.search_books(qu, [only_books])
  
  
-    @staticmethod
-    def chain_filters(filters, op='XXXChainedFilter.AND'):
+    def search_books(self, query, filters=None, max_results=10):
          """
          """
-        Chains a filter list together
+        Searches for Book objects using query
          """
          """
-        filters = filter(lambda x: x is not None, filters)
-        if not filters or filters is []:
-            return None
-        chf = ChainedFilter(JArray('object')(filters, Filter), op)
-        return chf
+        bks = []
+        bks_found = set()
+        query = query.query(is_book=True)
+        res = self.apply_filters(query, filters).field_limit(['book_id'])
+        for r in res:
+            try:
+                bid = r['book_id']
+                if not bid in bks_found:
+                    bks.append(catalogue.models.Book.objects.get(id=bid))
+                    bks_found.add(bid)
+            except catalogue.models.Book.DoesNotExist: pass
+        return bks
+ 
  
  
-    def filtered_categories(self, tags):
+    @staticmethod
+    def apply_filters(query, filters):
          """
          """
-        Return a list of tag categories, present in tags list.
+        Apply filters to a query
          """
          """
-        cats = {}
-        for t in tags:
-            cats[t.category] = True
-        return cats.keys()
-
-    def hint(self):
-        return Hint(self)
+        if filters is None: filters = []
+        filters = filter(lambda x: x is not None, filters)
+        for f in filters:
+            query = query.query(f)
+        return query